From af36d7f0df56de3e3e4bbfb15d0915097ecb8cab Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sun, 28 Aug 2005 20:18:39 -0400 Subject: [libata] license change, other bits - changes license of all code from OSL+GPL to plain ole GPL - except for NVIDIA, who hasn't yet responded about sata_nv - copyright holders were already contacted privately - adds info in each driver about where hardware/protocol docs may be obtained - where I have made major contributions, updated copyright dates diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c index e3b9692..7eaaf7a 100644 --- a/drivers/scsi/ahci.c +++ b/drivers/scsi/ahci.c @@ -1,26 +1,34 @@ /* * ahci.c - AHCI SATA support * - * Copyright 2004 Red Hat, Inc. + * Maintained by: Jeff Garzik + * Please ALWAYS copy linux-ide@vger.kernel.org + * on emails. * - * The contents of this file are subject to the Open - * Software License version 1.1 that can be found at - * http://www.opensource.org/licenses/osl-1.1.txt and is included herein - * by reference. + * Copyright 2004-2005 Red Hat, Inc. * - * Alternatively, the contents of this file may be used under the terms - * of the GNU General Public License version 2 (the "GPL") as distributed - * in the kernel source COPYING file, in which case the provisions of - * the GPL are applicable instead of the above. If you wish to allow - * the use of your version of this file only under the terms of the - * GPL and not to allow others to use your version of this file under - * the OSL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the GPL. - * If you do not delete the provisions above, a recipient may use your - * version of this file under either the OSL or the GPL. * - * Version 1.0 of the AHCI specification: + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * AHCI hardware documentation: * http://www.intel.com/technology/serialata/pdf/rev1_0.pdf + * http://www.intel.com/technology/serialata/pdf/rev1_1.pdf * */ diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c index d96ebf9..6898b7f 100644 --- a/drivers/scsi/ata_piix.c +++ b/drivers/scsi/ata_piix.c @@ -1,24 +1,42 @@ /* - - ata_piix.c - Intel PATA/SATA controllers - - Maintained by: Jeff Garzik - Please ALWAYS copy linux-ide@vger.kernel.org - on emails. - - - Copyright 2003-2004 Red Hat Inc - Copyright 2003-2004 Jeff Garzik - - - Copyright header from piix.c: - - Copyright (C) 1998-1999 Andrzej Krzysztofowicz, Author and Maintainer - Copyright (C) 1998-2000 Andre Hedrick - Copyright (C) 2003 Red Hat Inc - - May be copied or modified under the terms of the GNU General Public License - + * ata_piix.c - Intel PATA/SATA controllers + * + * Maintained by: Jeff Garzik + * Please ALWAYS copy linux-ide@vger.kernel.org + * on emails. + * + * + * Copyright 2003-2005 Red Hat Inc + * Copyright 2003-2005 Jeff Garzik + * + * + * Copyright header from piix.c: + * + * Copyright (C) 1998-1999 Andrzej Krzysztofowicz, Author and Maintainer + * Copyright (C) 1998-2000 Andre Hedrick + * Copyright (C) 2003 Red Hat Inc + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Hardware documentation available at http://developer.intel.com/ + * */ #include diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c index f4e7dcb..4154e5b 100644 --- a/drivers/scsi/libata-core.c +++ b/drivers/scsi/libata-core.c @@ -1,25 +1,35 @@ /* - libata-core.c - helper library for ATA - - Copyright 2003-2004 Red Hat, Inc. All rights reserved. - Copyright 2003-2004 Jeff Garzik - - The contents of this file are subject to the Open - Software License version 1.1 that can be found at - http://www.opensource.org/licenses/osl-1.1.txt and is included herein - by reference. - - Alternatively, the contents of this file may be used under the terms - of the GNU General Public License version 2 (the "GPL") as distributed - in the kernel source COPYING file, in which case the provisions of - the GPL are applicable instead of the above. If you wish to allow - the use of your version of this file only under the terms of the - GPL and not to allow others to use your version of this file under - the OSL, indicate your decision by deleting the provisions above and - replace them with the notice and other provisions required by the GPL. - If you do not delete the provisions above, a recipient may use your - version of this file under either the OSL or the GPL. - + * libata-core.c - helper library for ATA + * + * Maintained by: Jeff Garzik + * Please ALWAYS copy linux-ide@vger.kernel.org + * on emails. + * + * Copyright 2003-2004 Red Hat, Inc. All rights reserved. + * Copyright 2003-2004 Jeff Garzik + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Hardware documentation available from http://www.t13.org/ and + * http://www.sata-io.org/ + * */ #include diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c index 6a75ec2..c6aeab1 100644 --- a/drivers/scsi/libata-scsi.c +++ b/drivers/scsi/libata-scsi.c @@ -1,25 +1,36 @@ /* - libata-scsi.c - helper library for ATA - - Copyright 2003-2004 Red Hat, Inc. All rights reserved. - Copyright 2003-2004 Jeff Garzik - - The contents of this file are subject to the Open - Software License version 1.1 that can be found at - http://www.opensource.org/licenses/osl-1.1.txt and is included herein - by reference. - - Alternatively, the contents of this file may be used under the terms - of the GNU General Public License version 2 (the "GPL") as distributed - in the kernel source COPYING file, in which case the provisions of - the GPL are applicable instead of the above. If you wish to allow - the use of your version of this file only under the terms of the - GPL and not to allow others to use your version of this file under - the OSL, indicate your decision by deleting the provisions above and - replace them with the notice and other provisions required by the GPL. - If you do not delete the provisions above, a recipient may use your - version of this file under either the OSL or the GPL. - + * libata-scsi.c - helper library for ATA + * + * Maintained by: Jeff Garzik + * Please ALWAYS copy linux-ide@vger.kernel.org + * on emails. + * + * Copyright 2003-2004 Red Hat, Inc. All rights reserved. + * Copyright 2003-2004 Jeff Garzik + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Hardware documentation available from + * - http://www.t10.org/ + * - http://www.t13.org/ + * */ #include diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h index 3e7f4843..c51d658 100644 --- a/drivers/scsi/libata.h +++ b/drivers/scsi/libata.h @@ -1,25 +1,28 @@ /* - libata.h - helper library for ATA - - Copyright 2003-2004 Red Hat, Inc. All rights reserved. - Copyright 2003-2004 Jeff Garzik - - The contents of this file are subject to the Open - Software License version 1.1 that can be found at - http://www.opensource.org/licenses/osl-1.1.txt and is included herein - by reference. - - Alternatively, the contents of this file may be used under the terms - of the GNU General Public License version 2 (the "GPL") as distributed - in the kernel source COPYING file, in which case the provisions of - the GPL are applicable instead of the above. If you wish to allow - the use of your version of this file only under the terms of the - GPL and not to allow others to use your version of this file under - the OSL, indicate your decision by deleting the provisions above and - replace them with the notice and other provisions required by the GPL. - If you do not delete the provisions above, a recipient may use your - version of this file under either the OSL or the GPL. - + * libata.h - helper library for ATA + * + * Copyright 2003-2004 Red Hat, Inc. All rights reserved. + * Copyright 2003-2004 Jeff Garzik + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * */ #ifndef __LIBATA_H__ diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c index b0403cc..1e10370 100644 --- a/drivers/scsi/sata_nv.c +++ b/drivers/scsi/sata_nv.c @@ -20,6 +20,17 @@ * If you do not delete the provisions above, a recipient may use your * version of this file under either the OSL or the GPL. * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * No hardware documentation available outside of NVIDIA. + * This driver programs the NVIDIA SATA controller in a similar + * fashion as with other PCI IDE BMDMA controllers, with a few + * NV-specific details such as register offsets, SATA phy location, + * hotplug info, etc. + * + * * 0.06 * - Added generic SATA support by using a pci_device_id that filters on * the IDE storage class code. diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c index 919fb31..b27e2e2 100644 --- a/drivers/scsi/sata_promise.c +++ b/drivers/scsi/sata_promise.c @@ -7,21 +7,26 @@ * * Copyright 2003-2004 Red Hat, Inc. * - * The contents of this file are subject to the Open - * Software License version 1.1 that can be found at - * http://www.opensource.org/licenses/osl-1.1.txt and is included herein - * by reference. * - * Alternatively, the contents of this file may be used under the terms - * of the GNU General Public License version 2 (the "GPL") as distributed - * in the kernel source COPYING file, in which case the provisions of - * the GPL are applicable instead of the above. If you wish to allow - * the use of your version of this file only under the terms of the - * GPL and not to allow others to use your version of this file under - * the OSL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the GPL. - * If you do not delete the provisions above, a recipient may use your - * version of this file under either the OSL or the GPL. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Hardware information only available under NDA. * */ diff --git a/drivers/scsi/sata_promise.h b/drivers/scsi/sata_promise.h index 6e7e96b..6ee5e190 100644 --- a/drivers/scsi/sata_promise.h +++ b/drivers/scsi/sata_promise.h @@ -3,21 +3,24 @@ * * Copyright 2003-2004 Red Hat, Inc. * - * The contents of this file are subject to the Open - * Software License version 1.1 that can be found at - * http://www.opensource.org/licenses/osl-1.1.txt and is included herein - * by reference. * - * Alternatively, the contents of this file may be used under the terms - * of the GNU General Public License version 2 (the "GPL") as distributed - * in the kernel source COPYING file, in which case the provisions of - * the GPL are applicable instead of the above. If you wish to allow - * the use of your version of this file only under the terms of the - * GPL and not to allow others to use your version of this file under - * the OSL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the GPL. - * If you do not delete the provisions above, a recipient may use your - * version of this file under either the OSL or the GPL. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* * */ diff --git a/drivers/scsi/sata_qstor.c b/drivers/scsi/sata_qstor.c index 1383e8a..f6b716f 100644 --- a/drivers/scsi/sata_qstor.c +++ b/drivers/scsi/sata_qstor.c @@ -6,21 +6,24 @@ * Copyright 2005 Pacific Digital Corporation. * (OSL/GPL code release authorized by Jalil Fadavi). * - * The contents of this file are subject to the Open - * Software License version 1.1 that can be found at - * http://www.opensource.org/licenses/osl-1.1.txt and is included herein - * by reference. * - * Alternatively, the contents of this file may be used under the terms - * of the GNU General Public License version 2 (the "GPL") as distributed - * in the kernel source COPYING file, in which case the provisions of - * the GPL are applicable instead of the above. If you wish to allow - * the use of your version of this file only under the terms of the - * GPL and not to allow others to use your version of this file under - * the OSL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the GPL. - * If you do not delete the provisions above, a recipient may use your - * version of this file under either the OSL or the GPL. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* * */ diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c index 49ed557..345e6f2 100644 --- a/drivers/scsi/sata_sil.c +++ b/drivers/scsi/sata_sil.c @@ -5,24 +5,27 @@ * Please ALWAYS copy linux-ide@vger.kernel.org * on emails. * - * Copyright 2003 Red Hat, Inc. + * Copyright 2003-2005 Red Hat, Inc. * Copyright 2003 Benjamin Herrenschmidt * - * The contents of this file are subject to the Open - * Software License version 1.1 that can be found at - * http://www.opensource.org/licenses/osl-1.1.txt and is included herein - * by reference. * - * Alternatively, the contents of this file may be used under the terms - * of the GNU General Public License version 2 (the "GPL") as distributed - * in the kernel source COPYING file, in which case the provisions of - * the GPL are applicable instead of the above. If you wish to allow - * the use of your version of this file only under the terms of the - * GPL and not to allow others to use your version of this file under - * the OSL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the GPL. - * If you do not delete the provisions above, a recipient may use your - * version of this file under either the OSL or the GPL. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* * */ diff --git a/drivers/scsi/sata_sis.c b/drivers/scsi/sata_sis.c index e418b89..6db8b09 100644 --- a/drivers/scsi/sata_sis.c +++ b/drivers/scsi/sata_sis.c @@ -7,21 +7,26 @@ * * Copyright 2004 Uwe Koziolek * - * The contents of this file are subject to the Open - * Software License version 1.1 that can be found at - * http://www.opensource.org/licenses/osl-1.1.txt and is included herein - * by reference. * - * Alternatively, the contents of this file may be used under the terms - * of the GNU General Public License version 2 (the "GPL") as distributed - * in the kernel source COPYING file, in which case the provisions of - * the GPL are applicable instead of the above. If you wish to allow - * the use of your version of this file only under the terms of the - * GPL and not to allow others to use your version of this file under - * the OSL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the GPL. - * If you do not delete the provisions above, a recipient may use your - * version of this file under either the OSL or the GPL. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Hardware documentation available under NDA. * */ diff --git a/drivers/scsi/sata_svw.c b/drivers/scsi/sata_svw.c index 858e071..3884a3c 100644 --- a/drivers/scsi/sata_svw.c +++ b/drivers/scsi/sata_svw.c @@ -13,21 +13,26 @@ * This driver probably works with non-Apple versions of the * Broadcom chipset... * - * The contents of this file are subject to the Open - * Software License version 1.1 that can be found at - * http://www.opensource.org/licenses/osl-1.1.txt and is included herein - * by reference. * - * Alternatively, the contents of this file may be used under the terms - * of the GNU General Public License version 2 (the "GPL") as distributed - * in the kernel source COPYING file, in which case the provisions of - * the GPL are applicable instead of the above. If you wish to allow - * the use of your version of this file only under the terms of the - * GPL and not to allow others to use your version of this file under - * the OSL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the GPL. - * If you do not delete the provisions above, a recipient may use your - * version of this file under either the OSL or the GPL. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Hardware documentation available under NDA. * */ diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c index efd7d7a..c7f6ec2 100644 --- a/drivers/scsi/sata_sx4.c +++ b/drivers/scsi/sata_sx4.c @@ -7,21 +7,26 @@ * * Copyright 2003-2004 Red Hat, Inc. * - * The contents of this file are subject to the Open - * Software License version 1.1 that can be found at - * http://www.opensource.org/licenses/osl-1.1.txt and is included herein - * by reference. * - * Alternatively, the contents of this file may be used under the terms - * of the GNU General Public License version 2 (the "GPL") as distributed - * in the kernel source COPYING file, in which case the provisions of - * the GPL are applicable instead of the above. If you wish to allow - * the use of your version of this file only under the terms of the - * GPL and not to allow others to use your version of this file under - * the OSL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the GPL. - * If you do not delete the provisions above, a recipient may use your - * version of this file under either the OSL or the GPL. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Hardware documentation available under NDA. * */ diff --git a/drivers/scsi/sata_uli.c b/drivers/scsi/sata_uli.c index a71fb54..fa10219 100644 --- a/drivers/scsi/sata_uli.c +++ b/drivers/scsi/sata_uli.c @@ -1,21 +1,26 @@ /* * sata_uli.c - ULi Electronics SATA * - * The contents of this file are subject to the Open - * Software License version 1.1 that can be found at - * http://www.opensource.org/licenses/osl-1.1.txt and is included herein - * by reference. * - * Alternatively, the contents of this file may be used under the terms - * of the GNU General Public License version 2 (the "GPL") as distributed - * in the kernel source COPYING file, in which case the provisions of - * the GPL are applicable instead of the above. If you wish to allow - * the use of your version of this file only under the terms of the - * GPL and not to allow others to use your version of this file under - * the OSL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the GPL. - * If you do not delete the provisions above, a recipient may use your - * version of this file under either the OSL or the GPL. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Hardware documentation available under NDA. * */ diff --git a/drivers/scsi/sata_via.c b/drivers/scsi/sata_via.c index f43183c..6653ffe 100644 --- a/drivers/scsi/sata_via.c +++ b/drivers/scsi/sata_via.c @@ -1,34 +1,38 @@ /* - sata_via.c - VIA Serial ATA controllers - - Maintained by: Jeff Garzik - Please ALWAYS copy linux-ide@vger.kernel.org + * sata_via.c - VIA Serial ATA controllers + * + * Maintained by: Jeff Garzik + * Please ALWAYS copy linux-ide@vger.kernel.org on emails. - - Copyright 2003-2004 Red Hat, Inc. All rights reserved. - Copyright 2003-2004 Jeff Garzik - - The contents of this file are subject to the Open - Software License version 1.1 that can be found at - http://www.opensource.org/licenses/osl-1.1.txt and is included herein - by reference. - - Alternatively, the contents of this file may be used under the terms - of the GNU General Public License version 2 (the "GPL") as distributed - in the kernel source COPYING file, in which case the provisions of - the GPL are applicable instead of the above. If you wish to allow - the use of your version of this file only under the terms of the - GPL and not to allow others to use your version of this file under - the OSL, indicate your decision by deleting the provisions above and - replace them with the notice and other provisions required by the GPL. - If you do not delete the provisions above, a recipient may use your - version of this file under either the OSL or the GPL. - - ---------------------------------------------------------------------- - - To-do list: - * VT6421 PATA support - + * + * Copyright 2003-2004 Red Hat, Inc. All rights reserved. + * Copyright 2003-2004 Jeff Garzik + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Hardware documentation available under NDA. + * + * + * To-do list: + * - VT6421 PATA support + * */ #include diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c index c5e09dc..8bddb82 100644 --- a/drivers/scsi/sata_vsc.c +++ b/drivers/scsi/sata_vsc.c @@ -9,9 +9,29 @@ * * Bits from Jeff Garzik, Copyright RedHat, Inc. * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Vitesse hardware documentation presumably available under NDA. + * Intel 31244 (same hardware interface) documentation presumably + * available from http://developer.intel.com/ + * */ #include diff --git a/include/linux/ata.h b/include/linux/ata.h index ca5fcad..19c3e28 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -1,24 +1,29 @@ /* - Copyright 2003-2004 Red Hat, Inc. All rights reserved. - Copyright 2003-2004 Jeff Garzik - - The contents of this file are subject to the Open - Software License version 1.1 that can be found at - http://www.opensource.org/licenses/osl-1.1.txt and is included herein - by reference. - - Alternatively, the contents of this file may be used under the terms - of the GNU General Public License version 2 (the "GPL") as distributed - in the kernel source COPYING file, in which case the provisions of - the GPL are applicable instead of the above. If you wish to allow - the use of your version of this file only under the terms of the - GPL and not to allow others to use your version of this file under - the OSL, indicate your decision by deleting the provisions above and - replace them with the notice and other provisions required by the GPL. - If you do not delete the provisions above, a recipient may use your - version of this file under either the OSL or the GPL. - + * Copyright 2003-2004 Red Hat, Inc. All rights reserved. + * Copyright 2003-2004 Jeff Garzik + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * + * Hardware documentation available from http://www.t13.org/ + * */ #ifndef __LINUX_ATA_H__ diff --git a/include/linux/libata.h b/include/linux/libata.h index 6cd9ba6..51d2b20 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1,23 +1,26 @@ /* - Copyright 2003-2004 Red Hat, Inc. All rights reserved. - Copyright 2003-2004 Jeff Garzik - - The contents of this file are subject to the Open - Software License version 1.1 that can be found at - http://www.opensource.org/licenses/osl-1.1.txt and is included herein - by reference. - - Alternatively, the contents of this file may be used under the terms - of the GNU General Public License version 2 (the "GPL") as distributed - in the kernel source COPYING file, in which case the provisions of - the GPL are applicable instead of the above. If you wish to allow - the use of your version of this file only under the terms of the - GPL and not to allow others to use your version of this file under - the OSL, indicate your decision by deleting the provisions above and - replace them with the notice and other provisions required by the GPL. - If you do not delete the provisions above, a recipient may use your - version of this file under either the OSL or the GPL. - + * Copyright 2003-2005 Red Hat, Inc. All rights reserved. + * Copyright 2003-2005 Jeff Garzik + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * + * + * libata documentation is available via 'make {ps|pdf}docs', + * as Documentation/DocBook/libata.* + * */ #ifndef __LINUX_LIBATA_H__ -- cgit v0.10.2 From decd300b30e499fe6be1bbfc5650fc971de8c1fa Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Mon, 8 Aug 2005 13:24:38 +1000 Subject: [PATCH] ppc64: make arch/ppc64/boot standalone Make the bootheader for ppc64 independent from kernel and libc headers. * add -nostdinc -isystem $gccincludes to not include libc headers * declare all functions in header files, also the stuff from string.S * declare some functions static * use stddef.h to get size_t (hopefully ok) * remove ppc32-types.h, only elf.h used the __NN types With further modifications by Paul Mackerras and Stephen Rothwell. Signed-off-by: Olaf Hering Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/boot/Makefile b/arch/ppc64/boot/Makefile index 683b2d4..2c5f5e7 100644 --- a/arch/ppc64/boot/Makefile +++ b/arch/ppc64/boot/Makefile @@ -22,8 +22,8 @@ HOSTCC := gcc -BOOTCFLAGS := $(HOSTCFLAGS) $(LINUXINCLUDE) -fno-builtin -BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional +BOOTCFLAGS := $(HOSTCFLAGS) -fno-builtin -nostdinc -isystem $(shell $(CROSS32CC) -print-file-name=include) +BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc BOOTLFLAGS := -Ttext 0x00400000 -e _start -T $(srctree)/$(src)/zImage.lds OBJCOPYFLAGS := contents,alloc,load,readonly,data diff --git a/arch/ppc64/boot/addnote.c b/arch/ppc64/boot/addnote.c index 719663a..8041a98 100644 --- a/arch/ppc64/boot/addnote.c +++ b/arch/ppc64/boot/addnote.c @@ -157,7 +157,7 @@ main(int ac, char **av) PUT_32BE(ns, strlen(arch) + 1); PUT_32BE(ns + 4, N_DESCR * 4); PUT_32BE(ns + 8, 0x1275); - strcpy(&buf[ns + 12], arch); + strcpy((char *) &buf[ns + 12], arch); ns += 12 + strlen(arch) + 1; for (i = 0; i < N_DESCR; ++i, ns += 4) PUT_32BE(ns, descr[i]); @@ -172,7 +172,7 @@ main(int ac, char **av) PUT_32BE(ns, strlen(rpaname) + 1); PUT_32BE(ns + 4, sizeof(rpanote)); PUT_32BE(ns + 8, 0x12759999); - strcpy(&buf[ns + 12], rpaname); + strcpy((char *) &buf[ns + 12], rpaname); ns += 12 + ROUNDUP(strlen(rpaname) + 1); for (i = 0; i < N_RPA_DESCR; ++i, ns += 4) PUT_32BE(ns, rpanote[i]); diff --git a/arch/ppc64/boot/crt0.S b/arch/ppc64/boot/crt0.S index 04d3e74..3861e7f 100644 --- a/arch/ppc64/boot/crt0.S +++ b/arch/ppc64/boot/crt0.S @@ -9,7 +9,7 @@ * NOTE: this code runs in 32 bit mode and is packaged as ELF32. */ -#include +#include "ppc_asm.h" .text .globl _start diff --git a/arch/ppc64/boot/div64.S b/arch/ppc64/boot/div64.S index 38f7e46..722f360 100644 --- a/arch/ppc64/boot/div64.S +++ b/arch/ppc64/boot/div64.S @@ -13,7 +13,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include +#include "ppc_asm.h" .globl __div64_32 __div64_32: diff --git a/arch/ppc64/boot/elf.h b/arch/ppc64/boot/elf.h new file mode 100644 index 0000000..d4828fc --- /dev/null +++ b/arch/ppc64/boot/elf.h @@ -0,0 +1,149 @@ +#ifndef _PPC_BOOT_ELF_H_ +#define _PPC_BOOT_ELF_H_ + +/* 32-bit ELF base types. */ +typedef unsigned int Elf32_Addr; +typedef unsigned short Elf32_Half; +typedef unsigned int Elf32_Off; +typedef signed int Elf32_Sword; +typedef unsigned int Elf32_Word; + +/* 64-bit ELF base types. */ +typedef unsigned long long Elf64_Addr; +typedef unsigned short Elf64_Half; +typedef signed short Elf64_SHalf; +typedef unsigned long long Elf64_Off; +typedef signed int Elf64_Sword; +typedef unsigned int Elf64_Word; +typedef unsigned long long Elf64_Xword; +typedef signed long long Elf64_Sxword; + +/* These constants are for the segment types stored in the image headers */ +#define PT_NULL 0 +#define PT_LOAD 1 +#define PT_DYNAMIC 2 +#define PT_INTERP 3 +#define PT_NOTE 4 +#define PT_SHLIB 5 +#define PT_PHDR 6 +#define PT_TLS 7 /* Thread local storage segment */ +#define PT_LOOS 0x60000000 /* OS-specific */ +#define PT_HIOS 0x6fffffff /* OS-specific */ +#define PT_LOPROC 0x70000000 +#define PT_HIPROC 0x7fffffff +#define PT_GNU_EH_FRAME 0x6474e550 + +#define PT_GNU_STACK (PT_LOOS + 0x474e551) + +/* These constants define the different elf file types */ +#define ET_NONE 0 +#define ET_REL 1 +#define ET_EXEC 2 +#define ET_DYN 3 +#define ET_CORE 4 +#define ET_LOPROC 0xff00 +#define ET_HIPROC 0xffff + +/* These constants define the various ELF target machines */ +#define EM_NONE 0 +#define EM_PPC 20 /* PowerPC */ +#define EM_PPC64 21 /* PowerPC64 */ + +#define EI_NIDENT 16 + +typedef struct elf32_hdr { + unsigned char e_ident[EI_NIDENT]; + Elf32_Half e_type; + Elf32_Half e_machine; + Elf32_Word e_version; + Elf32_Addr e_entry; /* Entry point */ + Elf32_Off e_phoff; + Elf32_Off e_shoff; + Elf32_Word e_flags; + Elf32_Half e_ehsize; + Elf32_Half e_phentsize; + Elf32_Half e_phnum; + Elf32_Half e_shentsize; + Elf32_Half e_shnum; + Elf32_Half e_shstrndx; +} Elf32_Ehdr; + +typedef struct elf64_hdr { + unsigned char e_ident[16]; /* ELF "magic number" */ + Elf64_Half e_type; + Elf64_Half e_machine; + Elf64_Word e_version; + Elf64_Addr e_entry; /* Entry point virtual address */ + Elf64_Off e_phoff; /* Program header table file offset */ + Elf64_Off e_shoff; /* Section header table file offset */ + Elf64_Word e_flags; + Elf64_Half e_ehsize; + Elf64_Half e_phentsize; + Elf64_Half e_phnum; + Elf64_Half e_shentsize; + Elf64_Half e_shnum; + Elf64_Half e_shstrndx; +} Elf64_Ehdr; + +/* These constants define the permissions on sections in the program + header, p_flags. */ +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 + +typedef struct elf32_phdr { + Elf32_Word p_type; + Elf32_Off p_offset; + Elf32_Addr p_vaddr; + Elf32_Addr p_paddr; + Elf32_Word p_filesz; + Elf32_Word p_memsz; + Elf32_Word p_flags; + Elf32_Word p_align; +} Elf32_Phdr; + +typedef struct elf64_phdr { + Elf64_Word p_type; + Elf64_Word p_flags; + Elf64_Off p_offset; /* Segment file offset */ + Elf64_Addr p_vaddr; /* Segment virtual address */ + Elf64_Addr p_paddr; /* Segment physical address */ + Elf64_Xword p_filesz; /* Segment size in file */ + Elf64_Xword p_memsz; /* Segment size in memory */ + Elf64_Xword p_align; /* Segment alignment, file & memory */ +} Elf64_Phdr; + +#define EI_MAG0 0 /* e_ident[] indexes */ +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 +#define EI_OSABI 7 +#define EI_PAD 8 + +#define ELFMAG0 0x7f /* EI_MAG */ +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' +#define ELFMAG "\177ELF" +#define SELFMAG 4 + +#define ELFCLASSNONE 0 /* EI_CLASS */ +#define ELFCLASS32 1 +#define ELFCLASS64 2 +#define ELFCLASSNUM 3 + +#define ELFDATANONE 0 /* e_ident[EI_DATA] */ +#define ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +#define EV_NONE 0 /* e_version, EI_VERSION */ +#define EV_CURRENT 1 +#define EV_NUM 2 + +#define ELFOSABI_NONE 0 +#define ELFOSABI_LINUX 3 + +#endif /* _PPC_BOOT_ELF_H_ */ diff --git a/arch/ppc64/boot/main.c b/arch/ppc64/boot/main.c index 199d980..99e68cf 100644 --- a/arch/ppc64/boot/main.c +++ b/arch/ppc64/boot/main.c @@ -8,36 +8,28 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include "ppc32-types.h" +#include +#include +#include "elf.h" +#include "page.h" +#include "string.h" +#include "stdio.h" +#include "prom.h" #include "zlib.h" -#include -#include -#include -#include - -extern void *finddevice(const char *); -extern int getprop(void *, const char *, void *, int); -extern void printf(const char *fmt, ...); -extern int sprintf(char *buf, const char *fmt, ...); -void gunzip(void *, int, unsigned char *, int *); -void *claim(unsigned int, unsigned int, unsigned int); -void flush_cache(void *, unsigned long); -void pause(void); -extern void exit(void); - -unsigned long strlen(const char *s); -void *memmove(void *dest, const void *src, unsigned long n); -void *memcpy(void *dest, const void *src, unsigned long n); + +static void gunzip(void *, int, unsigned char *, int *); +extern void flush_cache(void *, unsigned long); + /* Value picked to match that used by yaboot */ #define PROG_START 0x01400000 #define RAM_END (256<<20) // Fixme: use OF */ -char *avail_ram; -char *begin_avail, *end_avail; -char *avail_high; -unsigned int heap_use; -unsigned int heap_max; +static char *avail_ram; +static char *begin_avail, *end_avail; +static char *avail_high; +static unsigned int heap_use; +static unsigned int heap_max; extern char _start[]; extern char _vmlinux_start[]; @@ -52,9 +44,9 @@ struct addr_range { unsigned long size; unsigned long memsize; }; -struct addr_range vmlinux = {0, 0, 0}; -struct addr_range vmlinuz = {0, 0, 0}; -struct addr_range initrd = {0, 0, 0}; +static struct addr_range vmlinux = {0, 0, 0}; +static struct addr_range vmlinuz = {0, 0, 0}; +static struct addr_range initrd = {0, 0, 0}; static char scratch[128<<10]; /* 128kB of scratch space for gunzip */ @@ -64,13 +56,6 @@ typedef void (*kernel_entry_t)( unsigned long, void *); -int (*prom)(void *); - -void *chosen_handle; -void *stdin; -void *stdout; -void *stderr; - #undef DEBUG static unsigned long claim_base = PROG_START; @@ -277,7 +262,7 @@ void zfree(void *x, void *addr, unsigned nb) #define DEFLATED 8 -void gunzip(void *dst, int dstlen, unsigned char *src, int *lenp) +static void gunzip(void *dst, int dstlen, unsigned char *src, int *lenp) { z_stream s; int r, i, flags; diff --git a/arch/ppc64/boot/page.h b/arch/ppc64/boot/page.h new file mode 100644 index 0000000..14eca30f --- /dev/null +++ b/arch/ppc64/boot/page.h @@ -0,0 +1,34 @@ +#ifndef _PPC_BOOT_PAGE_H +#define _PPC_BOOT_PAGE_H +/* + * Copyright (C) 2001 PPC64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifdef __ASSEMBLY__ +#define ASM_CONST(x) x +#else +#define __ASM_CONST(x) x##UL +#define ASM_CONST(x) __ASM_CONST(x) +#endif + +/* PAGE_SHIFT determines the page size */ +#define PAGE_SHIFT 12 +#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +/* align addr on a size boundary - adjust address up/down if needed */ +#define _ALIGN_UP(addr,size) (((addr)+((size)-1))&(~((size)-1))) +#define _ALIGN_DOWN(addr,size) ((addr)&(~((size)-1))) + +/* align addr on a size boundary - adjust address up if needed */ +#define _ALIGN(addr,size) _ALIGN_UP(addr,size) + +/* to align the pointer to the (next) page boundary */ +#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) + +#endif /* _PPC_BOOT_PAGE_H */ diff --git a/arch/ppc64/boot/ppc32-types.h b/arch/ppc64/boot/ppc32-types.h deleted file mode 100644 index f7b8884..0000000 --- a/arch/ppc64/boot/ppc32-types.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef _PPC64_TYPES_H -#define _PPC64_TYPES_H - -typedef __signed__ char __s8; -typedef unsigned char __u8; - -typedef __signed__ short __s16; -typedef unsigned short __u16; - -typedef __signed__ int __s32; -typedef unsigned int __u32; - -typedef __signed__ long long __s64; -typedef unsigned long long __u64; - -typedef signed char s8; -typedef unsigned char u8; - -typedef signed short s16; -typedef unsigned short u16; - -typedef signed int s32; -typedef unsigned int u32; - -typedef signed long long s64; -typedef unsigned long long u64; - -typedef struct { - __u32 u[4]; -} __attribute((aligned(16))) __vector128; - -#define BITS_PER_LONG 32 - -typedef __vector128 vector128; - -#endif /* _PPC64_TYPES_H */ diff --git a/arch/ppc64/boot/ppc_asm.h b/arch/ppc64/boot/ppc_asm.h new file mode 100644 index 0000000..1c2c281 --- /dev/null +++ b/arch/ppc64/boot/ppc_asm.h @@ -0,0 +1,62 @@ +#ifndef _PPC64_PPC_ASM_H +#define _PPC64_PPC_ASM_H +/* + * + * Definitions used by various bits of low-level assembly code on PowerPC. + * + * Copyright (C) 1995-1999 Gary Thomas, Paul Mackerras, Cort Dougan. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Condition Register Bit Fields */ + +#define cr0 0 +#define cr1 1 +#define cr2 2 +#define cr3 3 +#define cr4 4 +#define cr5 5 +#define cr6 6 +#define cr7 7 + + +/* General Purpose Registers (GPRs) */ + +#define r0 0 +#define r1 1 +#define r2 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#endif /* _PPC64_PPC_ASM_H */ diff --git a/arch/ppc64/boot/prom.c b/arch/ppc64/boot/prom.c index 5e48b80..4bea2f4 100644 --- a/arch/ppc64/boot/prom.c +++ b/arch/ppc64/boot/prom.c @@ -7,43 +7,19 @@ * 2 of the License, or (at your option) any later version. */ #include -#include -#include -#include - -extern __u32 __div64_32(unsigned long long *dividend, __u32 divisor); - -/* The unnecessary pointer compare is there - * to check for type safety (n must be 64bit) - */ -# define do_div(n,base) ({ \ - __u32 __base = (base); \ - __u32 __rem; \ - (void)(((typeof((n)) *)0) == ((unsigned long long *)0)); \ - if (((n) >> 32) == 0) { \ - __rem = (__u32)(n) % __base; \ - (n) = (__u32)(n) / __base; \ - } else \ - __rem = __div64_32(&(n), __base); \ - __rem; \ - }) +#include +#include "string.h" +#include "stdio.h" +#include "prom.h" int (*prom)(void *); void *chosen_handle; + void *stdin; void *stdout; void *stderr; -void exit(void); -void *finddevice(const char *name); -int getprop(void *phandle, const char *name, void *buf, int buflen); -void chrpboot(int a1, int a2, void *prom); /* in main.c */ - -int printf(char *fmt, ...); - -/* there is no convenient header to get this from... -- paulus */ -extern unsigned long strlen(const char *); int write(void *handle, void *ptr, int nb) @@ -210,107 +186,6 @@ fputs(char *str, void *f) return write(f, str, n) == n? 0: -1; } -int -readchar(void) -{ - char ch; - - for (;;) { - switch (read(stdin, &ch, 1)) { - case 1: - return ch; - case -1: - printf("read(stdin) returned -1\r\n"); - return -1; - } - } -} - -static char line[256]; -static char *lineptr; -static int lineleft; - -int -getchar(void) -{ - int c; - - if (lineleft == 0) { - lineptr = line; - for (;;) { - c = readchar(); - if (c == -1 || c == 4) - break; - if (c == '\r' || c == '\n') { - *lineptr++ = '\n'; - putchar('\n'); - break; - } - switch (c) { - case 0177: - case '\b': - if (lineptr > line) { - putchar('\b'); - putchar(' '); - putchar('\b'); - --lineptr; - } - break; - case 'U' & 0x1F: - while (lineptr > line) { - putchar('\b'); - putchar(' '); - putchar('\b'); - --lineptr; - } - break; - default: - if (lineptr >= &line[sizeof(line) - 1]) - putchar('\a'); - else { - putchar(c); - *lineptr++ = c; - } - } - } - lineleft = lineptr - line; - lineptr = line; - } - if (lineleft == 0) - return -1; - --lineleft; - return *lineptr++; -} - - - -/* String functions lifted from lib/vsprintf.c and lib/ctype.c */ -unsigned char _ctype[] = { -_C,_C,_C,_C,_C,_C,_C,_C, /* 0-7 */ -_C,_C|_S,_C|_S,_C|_S,_C|_S,_C|_S,_C,_C, /* 8-15 */ -_C,_C,_C,_C,_C,_C,_C,_C, /* 16-23 */ -_C,_C,_C,_C,_C,_C,_C,_C, /* 24-31 */ -_S|_SP,_P,_P,_P,_P,_P,_P,_P, /* 32-39 */ -_P,_P,_P,_P,_P,_P,_P,_P, /* 40-47 */ -_D,_D,_D,_D,_D,_D,_D,_D, /* 48-55 */ -_D,_D,_P,_P,_P,_P,_P,_P, /* 56-63 */ -_P,_U|_X,_U|_X,_U|_X,_U|_X,_U|_X,_U|_X,_U, /* 64-71 */ -_U,_U,_U,_U,_U,_U,_U,_U, /* 72-79 */ -_U,_U,_U,_U,_U,_U,_U,_U, /* 80-87 */ -_U,_U,_U,_P,_P,_P,_P,_P, /* 88-95 */ -_P,_L|_X,_L|_X,_L|_X,_L|_X,_L|_X,_L|_X,_L, /* 96-103 */ -_L,_L,_L,_L,_L,_L,_L,_L, /* 104-111 */ -_L,_L,_L,_L,_L,_L,_L,_L, /* 112-119 */ -_L,_L,_L,_P,_P,_P,_P,_C, /* 120-127 */ -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 128-143 */ -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 144-159 */ -_S|_SP,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P, /* 160-175 */ -_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P, /* 176-191 */ -_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U, /* 192-207 */ -_U,_U,_U,_U,_U,_U,_U,_P,_U,_U,_U,_U,_U,_U,_U,_L, /* 208-223 */ -_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L, /* 224-239 */ -_L,_L,_L,_L,_L,_L,_L,_P,_L,_L,_L,_L,_L,_L,_L,_L}; /* 240-255 */ - size_t strnlen(const char * s, size_t count) { const char *sc; @@ -320,44 +195,30 @@ size_t strnlen(const char * s, size_t count) return sc - s; } -unsigned long simple_strtoul(const char *cp,char **endp,unsigned int base) -{ - unsigned long result = 0,value; +extern unsigned int __div64_32(unsigned long long *dividend, + unsigned int divisor); - if (!base) { - base = 10; - if (*cp == '0') { - base = 8; - cp++; - if ((*cp == 'x') && isxdigit(cp[1])) { - cp++; - base = 16; - } - } - } - while (isxdigit(*cp) && - (value = isdigit(*cp) ? *cp-'0' : toupper(*cp)-'A'+10) < base) { - result = result*base + value; - cp++; - } - if (endp) - *endp = (char *)cp; - return result; -} - -long simple_strtol(const char *cp,char **endp,unsigned int base) -{ - if(*cp=='-') - return -simple_strtoul(cp+1,endp,base); - return simple_strtoul(cp,endp,base); -} +/* The unnecessary pointer compare is there + * to check for type safety (n must be 64bit) + */ +# define do_div(n,base) ({ \ + unsigned int __base = (base); \ + unsigned int __rem; \ + (void)(((typeof((n)) *)0) == ((unsigned long long *)0)); \ + if (((n) >> 32) == 0) { \ + __rem = (unsigned int)(n) % __base; \ + (n) = (unsigned int)(n) / __base; \ + } else \ + __rem = __div64_32(&(n), __base); \ + __rem; \ + }) static int skip_atoi(const char **s) { - int i=0; + int i, c; - while (isdigit(**s)) - i = i*10 + *((*s)++) - '0'; + for (i = 0; '0' <= (c = **s) && c <= '9'; ++*s) + i = i*10 + c - '0'; return i; } @@ -436,9 +297,6 @@ static char * number(char * str, unsigned long long num, int base, int size, int return str; } -/* Forward decl. needed for IP address printing stuff... */ -int sprintf(char * buf, const char *fmt, ...); - int vsprintf(char *buf, const char *fmt, va_list args) { int len; @@ -477,7 +335,7 @@ int vsprintf(char *buf, const char *fmt, va_list args) /* get field width */ field_width = -1; - if (isdigit(*fmt)) + if ('0' <= *fmt && *fmt <= '9') field_width = skip_atoi(&fmt); else if (*fmt == '*') { ++fmt; @@ -493,7 +351,7 @@ int vsprintf(char *buf, const char *fmt, va_list args) precision = -1; if (*fmt == '.') { ++fmt; - if (isdigit(*fmt)) + if ('0' <= *fmt && *fmt <= '9') precision = skip_atoi(&fmt); else if (*fmt == '*') { ++fmt; @@ -628,7 +486,7 @@ int sprintf(char * buf, const char *fmt, ...) static char sprint_buf[1024]; int -printf(char *fmt, ...) +printf(const char *fmt, ...) { va_list args; int n; diff --git a/arch/ppc64/boot/prom.h b/arch/ppc64/boot/prom.h new file mode 100644 index 0000000..96ab5ae --- /dev/null +++ b/arch/ppc64/boot/prom.h @@ -0,0 +1,18 @@ +#ifndef _PPC_BOOT_PROM_H_ +#define _PPC_BOOT_PROM_H_ + +extern int (*prom) (void *); +extern void *chosen_handle; + +extern void *stdin; +extern void *stdout; +extern void *stderr; + +extern int write(void *handle, void *ptr, int nb); +extern int read(void *handle, void *ptr, int nb); +extern void exit(void); +extern void pause(void); +extern void *finddevice(const char *); +extern void *claim(unsigned long virt, unsigned long size, unsigned long align); +extern int getprop(void *phandle, const char *name, void *buf, int buflen); +#endif /* _PPC_BOOT_PROM_H_ */ diff --git a/arch/ppc64/boot/stdio.h b/arch/ppc64/boot/stdio.h new file mode 100644 index 0000000..24bd3a8 --- /dev/null +++ b/arch/ppc64/boot/stdio.h @@ -0,0 +1,16 @@ +#ifndef _PPC_BOOT_STDIO_H_ +#define _PPC_BOOT_STDIO_H_ + +extern int printf(const char *fmt, ...); + +extern int sprintf(char *buf, const char *fmt, ...); + +extern int vsprintf(char *buf, const char *fmt, va_list args); + +extern int putc(int c, void *f); +extern int putchar(int c); +extern int getchar(void); + +extern int fputs(char *str, void *f); + +#endif /* _PPC_BOOT_STDIO_H_ */ diff --git a/arch/ppc64/boot/string.S b/arch/ppc64/boot/string.S index ba5f2d2..7ade87a 100644 --- a/arch/ppc64/boot/string.S +++ b/arch/ppc64/boot/string.S @@ -9,7 +9,7 @@ * NOTE: this code runs in 32 bit mode and is packaged as ELF32. */ -#include +#include "ppc_asm.h" .text .globl strcpy diff --git a/arch/ppc64/boot/string.h b/arch/ppc64/boot/string.h new file mode 100644 index 0000000..9289258 --- /dev/null +++ b/arch/ppc64/boot/string.h @@ -0,0 +1,16 @@ +#ifndef _PPC_BOOT_STRING_H_ +#define _PPC_BOOT_STRING_H_ + +extern char *strcpy(char *dest, const char *src); +extern char *strncpy(char *dest, const char *src, size_t n); +extern char *strcat(char *dest, const char *src); +extern int strcmp(const char *s1, const char *s2); +extern size_t strlen(const char *s); +extern size_t strnlen(const char *s, size_t count); + +extern void *memset(void *s, int c, size_t n); +extern void *memmove(void *dest, const void *src, unsigned long n); +extern void *memcpy(void *dest, const void *src, unsigned long n); +extern int memcmp(const void *s1, const void *s2, size_t n); + +#endif /* _PPC_BOOT_STRING_H_ */ diff --git a/arch/ppc64/boot/zlib.c b/arch/ppc64/boot/zlib.c index 78837e8..0d910cd 100644 --- a/arch/ppc64/boot/zlib.c +++ b/arch/ppc64/boot/zlib.c @@ -107,7 +107,7 @@ extern void *memcpy(void *, const void *, unsigned long); /* Diagnostic functions */ #ifdef DEBUG_ZLIB -# include +# include "stdio.h" # ifndef verbose # define verbose 0 # endif -- cgit v0.10.2 From e28f7faf05159f1cfd564596f5e6178edba6bd49 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 5 Aug 2005 19:39:06 +1000 Subject: [PATCH] Four level pagetables for ppc64 Implement 4-level pagetables for ppc64 This patch implements full four-level page tables for ppc64, thereby extending the usable user address range to 44 bits (16T). The patch uses a full page for the tables at the bottom and top level, and a quarter page for the intermediate levels. It uses full 64-bit pointers at every level, thus also increasing the addressable range of physical memory. This patch also tweaks the VSID allocation to allow matching range for user addresses (this halves the number of available contexts) and adds some #if and BUILD_BUG sanity checks. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c index 623b5d1..65d6e85 100644 --- a/arch/ppc64/mm/hash_utils.c +++ b/arch/ppc64/mm/hash_utils.c @@ -302,7 +302,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) int local = 0; cpumask_t tmp; - if ((ea & ~REGION_MASK) > EADDR_MASK) + if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) return 1; switch (REGION_ID(ea)) { diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c index f952460..a13e442 100644 --- a/arch/ppc64/mm/hugetlbpage.c +++ b/arch/ppc64/mm/hugetlbpage.c @@ -27,124 +27,91 @@ #include -#define HUGEPGDIR_SHIFT (HPAGE_SHIFT + PAGE_SHIFT - 3) -#define HUGEPGDIR_SIZE (1UL << HUGEPGDIR_SHIFT) -#define HUGEPGDIR_MASK (~(HUGEPGDIR_SIZE-1)) - -#define HUGEPTE_INDEX_SIZE 9 -#define HUGEPGD_INDEX_SIZE 10 - -#define PTRS_PER_HUGEPTE (1 << HUGEPTE_INDEX_SIZE) -#define PTRS_PER_HUGEPGD (1 << HUGEPGD_INDEX_SIZE) - -static inline int hugepgd_index(unsigned long addr) -{ - return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT; -} - -static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr) +/* Modelled after find_linux_pte() */ +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { - int index; + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + pte_t *pt; - if (! mm->context.huge_pgdir) - return NULL; + BUG_ON(! in_hugepage_area(mm->context, addr)); + addr &= HPAGE_MASK; + + pg = pgd_offset(mm, addr); + if (!pgd_none(*pg)) { + pu = pud_offset(pg, addr); + if (!pud_none(*pu)) { + pm = pmd_offset(pu, addr); + pt = (pte_t *)pm; + BUG_ON(!pmd_none(*pm) + && !(pte_present(*pt) && pte_huge(*pt))); + return pt; + } + } - index = hugepgd_index(addr); - BUG_ON(index >= PTRS_PER_HUGEPGD); - return (pud_t *)(mm->context.huge_pgdir + index); + return NULL; } -static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { - int index; - - if (pud_none(*dir)) - return NULL; - - index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE; - return (pte_t *)pud_page(*dir) + index; -} + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + pte_t *pt; -static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr) -{ BUG_ON(! in_hugepage_area(mm->context, addr)); - if (! mm->context.huge_pgdir) { - pgd_t *new; - spin_unlock(&mm->page_table_lock); - /* Don't use pgd_alloc(), because we want __GFP_REPEAT */ - new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT); - BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE)); - spin_lock(&mm->page_table_lock); - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (mm->context.huge_pgdir) - pgd_free(new); - else - mm->context.huge_pgdir = new; - } - return hugepgd_offset(mm, addr); -} + addr &= HPAGE_MASK; -static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr) -{ - if (! pud_present(*dir)) { - pte_t *new; + pg = pgd_offset(mm, addr); + pu = pud_alloc(mm, pg, addr); - spin_unlock(&mm->page_table_lock); - new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT); - BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE)); - spin_lock(&mm->page_table_lock); - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (pud_present(*dir)) { - if (new) - kmem_cache_free(zero_cache, new); - } else { - struct page *ptepage; - - if (! new) - return NULL; - ptepage = virt_to_page(new); - ptepage->mapping = (void *) mm; - ptepage->index = addr & HUGEPGDIR_MASK; - pud_populate(mm, dir, new); + if (pu) { + pm = pmd_alloc(mm, pu, addr); + if (pm) { + pt = (pte_t *)pm; + BUG_ON(!pmd_none(*pm) + && !(pte_present(*pt) && pte_huge(*pt))); + return pt; } } - return hugepte_offset(dir, addr); + return NULL; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) -{ - pud_t *pud; +#define HUGEPTE_BATCH_SIZE (HPAGE_SIZE / PMD_SIZE) - BUG_ON(! in_hugepage_area(mm->context, addr)); +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + int i; - pud = hugepgd_offset(mm, addr); - if (! pud) - return NULL; + if (pte_present(*ptep)) { + pte_clear(mm, addr, ptep); + flush_tlb_pending(); + } - return hugepte_offset(pud, addr); + for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) { + *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); + ptep++; + } } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { - pud_t *pud; + unsigned long old = pte_update(ptep, ~0UL); + int i; - BUG_ON(! in_hugepage_area(mm->context, addr)); + if (old & _PAGE_HASHPTE) + hpte_update(mm, addr, old, 0); - pud = hugepgd_alloc(mm, addr); - if (! pud) - return NULL; + for (i = 1; i < HUGEPTE_BATCH_SIZE; i++) + ptep[i] = __pte(0); - return hugepte_alloc(mm, pud, addr); + return __pte(old); } /* @@ -541,42 +508,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } } -void hugetlb_mm_free_pgd(struct mm_struct *mm) -{ - int i; - pgd_t *pgdir; - - spin_lock(&mm->page_table_lock); - - pgdir = mm->context.huge_pgdir; - if (! pgdir) - goto out; - - mm->context.huge_pgdir = NULL; - - /* cleanup any hugepte pages leftover */ - for (i = 0; i < PTRS_PER_HUGEPGD; i++) { - pud_t *pud = (pud_t *)(pgdir + i); - - if (! pud_none(*pud)) { - pte_t *pte = (pte_t *)pud_page(*pud); - struct page *ptepage = virt_to_page(pte); - - ptepage->mapping = NULL; - - BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE)); - kmem_cache_free(zero_cache, pte); - } - pud_clear(pud); - } - - BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE)); - kmem_cache_free(zero_cache, pgdir); - - out: - spin_unlock(&mm->page_table_lock); -} - int hash_huge_page(struct mm_struct *mm, unsigned long access, unsigned long ea, unsigned long vsid, int local) { diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c index b6e75b8..c65b87b 100644 --- a/arch/ppc64/mm/imalloc.c +++ b/arch/ppc64/mm/imalloc.c @@ -31,7 +31,7 @@ static int get_free_im_addr(unsigned long size, unsigned long *im_addr) break; if ((unsigned long)tmp->addr >= ioremap_bot) addr = tmp->size + (unsigned long) tmp->addr; - if (addr > IMALLOC_END-size) + if (addr >= IMALLOC_END-size) return 1; } *im_addr = addr; diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index e58a24d..87f256d 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -66,6 +66,14 @@ #include #include +#if PGTABLE_RANGE > USER_VSID_RANGE +#warning Limited user VSID range means pagetable space is wasted +#endif + +#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) +#warning TASK_SIZE is smaller than it needs to be. +#endif + int mem_init_done; unsigned long ioremap_bot = IMALLOC_BASE; static unsigned long phbs_io_bot = PHBS_IO_BASE; @@ -226,7 +234,7 @@ void __iomem * __ioremap(unsigned long addr, unsigned long size, * Before that, we map using addresses going * up from ioremap_bot. imalloc will use * the addresses from ioremap_bot through - * IMALLOC_END (0xE000001fffffffff) + * IMALLOC_END * */ pa = addr & PAGE_MASK; @@ -417,12 +425,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) int index; int err; -#ifdef CONFIG_HUGETLB_PAGE - /* We leave htlb_segs as it was, but for a fork, we need to - * clear the huge_pgdir. */ - mm->context.huge_pgdir = NULL; -#endif - again: if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) return -ENOMEM; @@ -453,8 +455,6 @@ void destroy_context(struct mm_struct *mm) spin_unlock(&mmu_context_lock); mm->context.id = NO_CONTEXT; - - hugetlb_mm_free_pgd(mm); } /* @@ -833,23 +833,43 @@ void __iomem * reserve_phb_iospace(unsigned long size) return virt_addr; } -kmem_cache_t *zero_cache; - -static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags) +static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags) { - memset(pte, 0, PAGE_SIZE); + memset(addr, 0, kmem_cache_size(cache)); } +static const int pgtable_cache_size[2] = { + PTE_TABLE_SIZE, PMD_TABLE_SIZE +}; +static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { + "pgd_pte_cache", "pud_pmd_cache", +}; + +kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; + void pgtable_cache_init(void) { - zero_cache = kmem_cache_create("zero", - PAGE_SIZE, - 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - zero_ctor, - NULL); - if (!zero_cache) - panic("pgtable_cache_init(): could not create zero_cache!\n"); + int i; + + BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]); + BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]); + BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]); + BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]); + + for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { + int size = pgtable_cache_size[i]; + const char *name = pgtable_cache_name[i]; + + pgtable_cache[i] = kmem_cache_create(name, + size, size, + SLAB_HWCACHE_ALIGN + | SLAB_MUST_HWCACHE_ALIGN, + zero_ctor, + NULL); + if (! pgtable_cache[i]) + panic("pgtable_cache_init(): could not create %s!\n", + name); + } } pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S index 8379d67..f20fc52 100644 --- a/arch/ppc64/mm/slb_low.S +++ b/arch/ppc64/mm/slb_low.S @@ -91,7 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) 0: /* user address: proto-VSID = context<<15 | ESID */ li r11,SLB_VSID_USER - srdi. r9,r3,13 + srdi. r9,r3,USER_ESID_BITS bne- 8f /* invalid ea bits set */ #ifdef CONFIG_HUGETLB_PAGE diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c index 26f0172..d8a6593 100644 --- a/arch/ppc64/mm/tlb.c +++ b/arch/ppc64/mm/tlb.c @@ -41,7 +41,58 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); unsigned long pte_freelist_forced_free; -void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage) +struct pte_freelist_batch +{ + struct rcu_head rcu; + unsigned int index; + pgtable_free_t tables[0]; +}; + +DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); +unsigned long pte_freelist_forced_free; + +#define PTE_FREELIST_SIZE \ + ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ + / sizeof(pgtable_free_t)) + +#ifdef CONFIG_SMP +static void pte_free_smp_sync(void *arg) +{ + /* Do nothing, just ensure we sync with all CPUs */ +} +#endif + +/* This is only called when we are critically out of memory + * (and fail to get a page in pte_free_tlb). + */ +static void pgtable_free_now(pgtable_free_t pgf) +{ + pte_freelist_forced_free++; + + smp_call_function(pte_free_smp_sync, NULL, 0, 1); + + pgtable_free(pgf); +} + +static void pte_free_rcu_callback(struct rcu_head *head) +{ + struct pte_freelist_batch *batch = + container_of(head, struct pte_freelist_batch, rcu); + unsigned int i; + + for (i = 0; i < batch->index; i++) + pgtable_free(batch->tables[i]); + + free_page((unsigned long)batch); +} + +static void pte_free_submit(struct pte_freelist_batch *batch) +{ + INIT_RCU_HEAD(&batch->rcu); + call_rcu(&batch->rcu, pte_free_rcu_callback); +} + +void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) { /* This is safe as we are holding page_table_lock */ cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); @@ -49,19 +100,19 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage) if (atomic_read(&tlb->mm->mm_users) < 2 || cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) { - pte_free(ptepage); + pgtable_free(pgf); return; } if (*batchp == NULL) { *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); if (*batchp == NULL) { - pte_free_now(ptepage); + pgtable_free_now(pgf); return; } (*batchp)->index = 0; } - (*batchp)->pages[(*batchp)->index++] = ptepage; + (*batchp)->tables[(*batchp)->index++] = pgf; if ((*batchp)->index == PTE_FREELIST_SIZE) { pte_free_submit(*batchp); *batchp = NULL; @@ -132,42 +183,6 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) put_cpu(); } -#ifdef CONFIG_SMP -static void pte_free_smp_sync(void *arg) -{ - /* Do nothing, just ensure we sync with all CPUs */ -} -#endif - -/* This is only called when we are critically out of memory - * (and fail to get a page in pte_free_tlb). - */ -void pte_free_now(struct page *ptepage) -{ - pte_freelist_forced_free++; - - smp_call_function(pte_free_smp_sync, NULL, 0, 1); - - pte_free(ptepage); -} - -static void pte_free_rcu_callback(struct rcu_head *head) -{ - struct pte_freelist_batch *batch = - container_of(head, struct pte_freelist_batch, rcu); - unsigned int i; - - for (i = 0; i < batch->index; i++) - pte_free(batch->pages[i]); - free_page((unsigned long)batch); -} - -void pte_free_submit(struct pte_freelist_batch *batch) -{ - INIT_RCU_HEAD(&batch->rcu); - call_rcu(&batch->rcu, pte_free_rcu_callback); -} - void pte_free_finish(void) { /* This is safe as we are holding page_table_lock */ diff --git a/include/asm-ppc64/imalloc.h b/include/asm-ppc64/imalloc.h index e46ff68..42adf70 100644 --- a/include/asm-ppc64/imalloc.h +++ b/include/asm-ppc64/imalloc.h @@ -6,7 +6,7 @@ */ #define PHBS_IO_BASE VMALLOC_END #define IMALLOC_BASE (PHBS_IO_BASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ -#define IMALLOC_END (VMALLOC_START + EADDR_MASK) +#define IMALLOC_END (VMALLOC_START + PGTABLE_RANGE) /* imalloc region types */ diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h index 70348a8..959a4bf 100644 --- a/include/asm-ppc64/mmu.h +++ b/include/asm-ppc64/mmu.h @@ -259,8 +259,10 @@ extern void stabs_alloc(void); #define VSID_BITS 36 #define VSID_MODULUS ((1UL<context.htlb_segs) @@ -125,36 +126,42 @@ extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct pag * Entries in the pte table are 64b, while entries in the pgd & pmd are 32b. */ typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned int pmd; } pmd_t; -typedef struct { unsigned int pgd; } pgd_t; +typedef struct { unsigned long pmd; } pmd_t; +typedef struct { unsigned long pud; } pud_t; +typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; #define pte_val(x) ((x).pte) #define pmd_val(x) ((x).pmd) +#define pud_val(x) ((x).pud) #define pgd_val(x) ((x).pgd) #define pgprot_val(x) ((x).pgprot) -#define __pte(x) ((pte_t) { (x) } ) -#define __pmd(x) ((pmd_t) { (x) } ) -#define __pgd(x) ((pgd_t) { (x) } ) -#define __pgprot(x) ((pgprot_t) { (x) } ) +#define __pte(x) ((pte_t) { (x) }) +#define __pmd(x) ((pmd_t) { (x) }) +#define __pud(x) ((pud_t) { (x) }) +#define __pgd(x) ((pgd_t) { (x) }) +#define __pgprot(x) ((pgprot_t) { (x) }) #else /* * .. while these make it easier on the compiler */ typedef unsigned long pte_t; -typedef unsigned int pmd_t; -typedef unsigned int pgd_t; +typedef unsigned long pmd_t; +typedef unsigned long pud_t; +typedef unsigned long pgd_t; typedef unsigned long pgprot_t; #define pte_val(x) (x) #define pmd_val(x) (x) +#define pud_val(x) (x) #define pgd_val(x) (x) #define pgprot_val(x) (x) #define __pte(x) (x) #define __pmd(x) (x) +#define __pud(x) (x) #define __pgd(x) (x) #define __pgprot(x) (x) @@ -208,9 +215,6 @@ extern u64 ppc64_pft_size; /* Log 2 of page table size */ #define USER_REGION_ID (0UL) #define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define __bpn_to_ba(x) ((((unsigned long)(x)) << PAGE_SHIFT) + KERNELBASE) -#define __ba_to_bpn(x) ((((unsigned long)(x)) & ~REGION_MASK) >> PAGE_SHIFT) - #define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) #ifdef CONFIG_DISCONTIGMEM diff --git a/include/asm-ppc64/pgalloc.h b/include/asm-ppc64/pgalloc.h index 4fc4b73..26bc49c 100644 --- a/include/asm-ppc64/pgalloc.h +++ b/include/asm-ppc64/pgalloc.h @@ -6,7 +6,12 @@ #include #include -extern kmem_cache_t *zero_cache; +extern kmem_cache_t *pgtable_cache[]; + +#define PTE_CACHE_NUM 0 +#define PMD_CACHE_NUM 1 +#define PUD_CACHE_NUM 1 +#define PGD_CACHE_NUM 0 /* * This program is free software; you can redistribute it and/or @@ -15,30 +20,40 @@ extern kmem_cache_t *zero_cache; * 2 of the License, or (at your option) any later version. */ -static inline pgd_t * -pgd_alloc(struct mm_struct *mm) +static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL); + return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); } -static inline void -pgd_free(pgd_t *pgd) +static inline void pgd_free(pgd_t *pgd) { - kmem_cache_free(zero_cache, pgd); + kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd); +} + +#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, PUD) + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pud_free(pud_t *pud) +{ + kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud); } #define pud_populate(MM, PUD, PMD) pud_set(PUD, PMD) -static inline pmd_t * -pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM], + GFP_KERNEL|__GFP_REPEAT); } -static inline void -pmd_free(pmd_t *pmd) +static inline void pmd_free(pmd_t *pmd) { - kmem_cache_free(zero_cache, pmd); + kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); } #define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, pte) @@ -47,44 +62,58 @@ pmd_free(pmd_t *pmd) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], + GFP_KERNEL|__GFP_REPEAT); } static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); - if (pte) - return virt_to_page(pte); - return NULL; + return virt_to_page(pte_alloc_one_kernel(mm, address)); } static inline void pte_free_kernel(pte_t *pte) { - kmem_cache_free(zero_cache, pte); + kmem_cache_free(pgtable_cache[PTE_CACHE_NUM], pte); } static inline void pte_free(struct page *ptepage) { - kmem_cache_free(zero_cache, page_address(ptepage)); + pte_free_kernel(page_address(ptepage)); } -struct pte_freelist_batch +#define PGF_CACHENUM_MASK 0xf + +typedef struct pgtable_free { + unsigned long val; +} pgtable_free_t; + +static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, + unsigned long mask) { - struct rcu_head rcu; - unsigned int index; - struct page * pages[0]; -}; + BUG_ON(cachenum > PGF_CACHENUM_MASK); -#define PTE_FREELIST_SIZE ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \ - sizeof(struct page *)) + return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum}; +} -extern void pte_free_now(struct page *ptepage); -extern void pte_free_submit(struct pte_freelist_batch *batch); +static inline void pgtable_free(pgtable_free_t pgf) +{ + void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK); + int cachenum = pgf.val & PGF_CACHENUM_MASK; -DECLARE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); + kmem_cache_free(pgtable_cache[cachenum], p); +} -void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage); -#define __pmd_free_tlb(tlb, pmd) __pte_free_tlb(tlb, virt_to_page(pmd)) +void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); + +#define __pte_free_tlb(tlb, ptepage) \ + pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \ + PTE_CACHE_NUM, PTE_TABLE_SIZE-1)) +#define __pmd_free_tlb(tlb, pmd) \ + pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \ + PMD_CACHE_NUM, PMD_TABLE_SIZE-1)) +#define __pud_free_tlb(tlb, pmd) \ + pgtable_free_tlb(tlb, pgtable_free_cache(pud, \ + PUD_CACHE_NUM, PUD_TABLE_SIZE-1)) #define check_pgt_cache() do { } while (0) diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h index 46cf61c..5ea952a 100644 --- a/include/asm-ppc64/pgtable.h +++ b/include/asm-ppc64/pgtable.h @@ -15,19 +15,24 @@ #include #endif /* __ASSEMBLY__ */ -#include - /* * Entries per page directory level. The PTE level must use a 64b record * for each page table entry. The PMD and PGD level use a 32b record for * each entry by assuming that each entry is page aligned. */ #define PTE_INDEX_SIZE 9 -#define PMD_INDEX_SIZE 10 -#define PGD_INDEX_SIZE 10 +#define PMD_INDEX_SIZE 7 +#define PUD_INDEX_SIZE 7 +#define PGD_INDEX_SIZE 9 + +#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) +#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) +#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) +#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) +#define PTRS_PER_PUD (1 << PMD_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) /* PMD_SHIFT determines what a second-level page table entry can map */ @@ -35,8 +40,13 @@ #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -/* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +/* PUD_SHIFT determines what a third-level page table entry can map */ +#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) + +/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ +#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) @@ -45,15 +55,23 @@ /* * Size of EA range mapped by our pagetables. */ -#define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ - PGD_INDEX_SIZE + PAGE_SHIFT) -#define EADDR_MASK ((1UL << EADDR_SIZE) - 1) +#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ + PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) +#define PGTABLE_RANGE (1UL << PGTABLE_EADDR_SIZE) + +#if TASK_SIZE_USER64 > PGTABLE_RANGE +#error TASK_SIZE_USER64 exceeds pagetable range +#endif + +#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT)) +#error TASK_SIZE_USER64 exceeds user VSID range +#endif /* * Define the address range of the vmalloc VM area. */ #define VMALLOC_START (0xD000000000000000ul) -#define VMALLOC_SIZE (0x10000000000UL) +#define VMALLOC_SIZE (0x80000000000UL) #define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) /* @@ -154,8 +172,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; #ifndef __ASSEMBLY__ int hash_huge_page(struct mm_struct *mm, unsigned long access, unsigned long ea, unsigned long vsid, int local); - -void hugetlb_mm_free_pgd(struct mm_struct *mm); #endif /* __ASSEMBLY__ */ #define HAVE_ARCH_UNMAPPED_AREA @@ -163,7 +179,6 @@ void hugetlb_mm_free_pgd(struct mm_struct *mm); #else #define hash_huge_page(mm,a,ea,vsid,local) -1 -#define hugetlb_mm_free_pgd(mm) do {} while (0) #endif @@ -197,39 +212,45 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) #define pte_pfn(x) ((unsigned long)((pte_val(x) >> PTE_SHIFT))) #define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pmd_set(pmdp, ptep) \ - (pmd_val(*(pmdp)) = __ba_to_bpn(ptep)) +#define pmd_set(pmdp, ptep) ({BUG_ON((u64)ptep < KERNELBASE); pmd_val(*(pmdp)) = (unsigned long)(ptep);}) #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) == 0) #define pmd_present(pmd) (pmd_val(pmd) != 0) #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0) -#define pmd_page_kernel(pmd) (__bpn_to_ba(pmd_val(pmd))) +#define pmd_page_kernel(pmd) (pmd_val(pmd)) #define pmd_page(pmd) virt_to_page(pmd_page_kernel(pmd)) -#define pud_set(pudp, pmdp) (pud_val(*(pudp)) = (__ba_to_bpn(pmdp))) +#define pud_set(pudp, pmdp) (pud_val(*(pudp)) = (unsigned long)(pmdp)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) ((pud_val(pud)) == 0UL) -#define pud_present(pud) (pud_val(pud) != 0UL) -#define pud_clear(pudp) (pud_val(*(pudp)) = 0UL) -#define pud_page(pud) (__bpn_to_ba(pud_val(pud))) +#define pud_bad(pud) ((pud_val(pud)) == 0) +#define pud_present(pud) (pud_val(pud) != 0) +#define pud_clear(pudp) (pud_val(*(pudp)) = 0) +#define pud_page(pud) (pud_val(pud)) + +#define pgd_set(pgdp, pudp) ({pgd_val(*(pgdp)) = (unsigned long)(pudp);}) +#define pgd_none(pgd) (!pgd_val(pgd)) +#define pgd_bad(pgd) (pgd_val(pgd) == 0) +#define pgd_present(pgd) (pgd_val(pgd) != 0) +#define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0) +#define pgd_page(pgd) (pgd_val(pgd)) /* * Find an entry in a page-table-directory. We combine the address region * (the high order N bits) and the pgd portion of the address. */ /* to avoid overflow in free_pgtables we don't use PTRS_PER_PGD here */ -#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x7ff) +#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x1ff) #define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -/* Find an entry in the second-level page table.. */ +#define pud_offset(pgdp, addr) \ + (((pud_t *) pgd_page(*(pgdp))) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) + #define pmd_offset(pudp,addr) \ - ((pmd_t *) pud_page(*(pudp)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + (((pmd_t *) pud_page(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) -/* Find an entry in the third-level page table.. */ #define pte_offset_kernel(dir,addr) \ - ((pte_t *) pmd_page_kernel(*(dir)) \ - + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) + (((pte_t *) pmd_page_kernel(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) @@ -458,23 +479,18 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e)) + printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e)) + printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -/* - * Because the huge pgtables are only 2 level, they can take - * at most around 4M, much less than one hugepage which the - * process is presumably entitled to use. So we don't bother - * freeing up the pagetables on unmap, and wait until - * destroy_context() to clean up the lot. - */ #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) + free_pgd_range(tlb, addr, end, floor, ceiling) /* * This gets called at the end of handling a page fault, when diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h index 352306c..50b14c0 100644 --- a/include/asm-ppc64/processor.h +++ b/include/asm-ppc64/processor.h @@ -382,8 +382,8 @@ extern long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); extern struct task_struct *last_task_used_math; extern struct task_struct *last_task_used_altivec; -/* 64-bit user address space is 41-bits (2TBs user VM) */ -#define TASK_SIZE_USER64 (0x0000020000000000UL) +/* 64-bit user address space is 44-bits (16TB user VM) */ +#define TASK_SIZE_USER64 (0x0000100000000000UL) /* * 32-bit user address space is 4GB - 1 page -- cgit v0.10.2 From 34153fa3af45d84f3221d9b67ba2ab7e8a220d28 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 9 Aug 2005 10:36:34 +0200 Subject: [PATCH] flattened device tree changes This patch updates the format of the flattened device-tree passed between the boot trampoline and the kernel to support a more compact representation, for use by embedded systems mostly. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c index 5aca01d..255c39a 100644 --- a/arch/ppc64/kernel/prom.c +++ b/arch/ppc64/kernel/prom.c @@ -625,8 +625,8 @@ void __init finish_device_tree(void) static inline char *find_flat_dt_string(u32 offset) { - return ((char *)initial_boot_params) + initial_boot_params->off_dt_strings - + offset; + return ((char *)initial_boot_params) + + initial_boot_params->off_dt_strings + offset; } /** @@ -635,26 +635,33 @@ static inline char *find_flat_dt_string(u32 offset) * unflatten the tree */ static int __init scan_flat_dt(int (*it)(unsigned long node, - const char *full_path, void *data), + const char *uname, int depth, + void *data), void *data) { unsigned long p = ((unsigned long)initial_boot_params) + initial_boot_params->off_dt_struct; int rc = 0; + int depth = -1; do { u32 tag = *((u32 *)p); char *pathp; p += 4; - if (tag == OF_DT_END_NODE) + if (tag == OF_DT_END_NODE) { + depth --; + continue; + } + if (tag == OF_DT_NOP) continue; if (tag == OF_DT_END) break; if (tag == OF_DT_PROP) { u32 sz = *((u32 *)p); p += 8; - p = _ALIGN(p, sz >= 8 ? 8 : 4); + if (initial_boot_params->version < 0x10) + p = _ALIGN(p, sz >= 8 ? 8 : 4); p += sz; p = _ALIGN(p, 4); continue; @@ -664,9 +671,18 @@ static int __init scan_flat_dt(int (*it)(unsigned long node, " device tree !\n", tag); return -EINVAL; } + depth++; pathp = (char *)p; p = _ALIGN(p + strlen(pathp) + 1, 4); - rc = it(p, pathp, data); + if ((*pathp) == '/') { + char *lp, *np; + for (lp = NULL, np = pathp; *np; np++) + if ((*np) == '/') + lp = np+1; + if (lp != NULL) + pathp = lp; + } + rc = it(p, pathp, depth, data); if (rc != 0) break; } while(1); @@ -689,17 +705,21 @@ static void* __init get_flat_dt_prop(unsigned long node, const char *name, const char *nstr; p += 4; + if (tag == OF_DT_NOP) + continue; if (tag != OF_DT_PROP) return NULL; sz = *((u32 *)p); noff = *((u32 *)(p + 4)); p += 8; - p = _ALIGN(p, sz >= 8 ? 8 : 4); + if (initial_boot_params->version < 0x10) + p = _ALIGN(p, sz >= 8 ? 8 : 4); nstr = find_flat_dt_string(noff); if (nstr == NULL) { - printk(KERN_WARNING "Can't find property index name !\n"); + printk(KERN_WARNING "Can't find property index" + " name !\n"); return NULL; } if (strcmp(name, nstr) == 0) { @@ -713,7 +733,7 @@ static void* __init get_flat_dt_prop(unsigned long node, const char *name, } static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, - unsigned long align) + unsigned long align) { void *res; @@ -727,13 +747,16 @@ static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, static unsigned long __init unflatten_dt_node(unsigned long mem, unsigned long *p, struct device_node *dad, - struct device_node ***allnextpp) + struct device_node ***allnextpp, + unsigned long fpsize) { struct device_node *np; struct property *pp, **prev_pp = NULL; char *pathp; u32 tag; - unsigned int l; + unsigned int l, allocl; + int has_name = 0; + int new_format = 0; tag = *((u32 *)(*p)); if (tag != OF_DT_BEGIN_NODE) { @@ -742,21 +765,62 @@ static unsigned long __init unflatten_dt_node(unsigned long mem, } *p += 4; pathp = (char *)*p; - l = strlen(pathp) + 1; + l = allocl = strlen(pathp) + 1; *p = _ALIGN(*p + l, 4); - np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + l, + /* version 0x10 has a more compact unit name here instead of the full + * path. we accumulate the full path size using "fpsize", we'll rebuild + * it later. We detect this because the first character of the name is + * not '/'. + */ + if ((*pathp) != '/') { + new_format = 1; + if (fpsize == 0) { + /* root node: special case. fpsize accounts for path + * plus terminating zero. root node only has '/', so + * fpsize should be 2, but we want to avoid the first + * level nodes to have two '/' so we use fpsize 1 here + */ + fpsize = 1; + allocl = 2; + } else { + /* account for '/' and path size minus terminal 0 + * already in 'l' + */ + fpsize += l; + allocl = fpsize; + } + } + + + np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl, __alignof__(struct device_node)); if (allnextpp) { memset(np, 0, sizeof(*np)); np->full_name = ((char*)np) + sizeof(struct device_node); - memcpy(np->full_name, pathp, l); + if (new_format) { + char *p = np->full_name; + /* rebuild full path for new format */ + if (dad && dad->parent) { + strcpy(p, dad->full_name); +#ifdef DEBUG + if ((strlen(p) + l + 1) != allocl) { + DBG("%s: p: %d, l: %d, a: %d\n", + pathp, strlen(p), l, allocl); + } +#endif + p += strlen(p); + } + *(p++) = '/'; + memcpy(p, pathp, l); + } else + memcpy(np->full_name, pathp, l); prev_pp = &np->properties; **allnextpp = np; *allnextpp = &np->allnext; if (dad != NULL) { np->parent = dad; - /* we temporarily use the `next' field as `last_child'. */ + /* we temporarily use the next field as `last_child'*/ if (dad->next == 0) dad->child = np; else @@ -770,18 +834,26 @@ static unsigned long __init unflatten_dt_node(unsigned long mem, char *pname; tag = *((u32 *)(*p)); + if (tag == OF_DT_NOP) { + *p += 4; + continue; + } if (tag != OF_DT_PROP) break; *p += 4; sz = *((u32 *)(*p)); noff = *((u32 *)((*p) + 4)); - *p = _ALIGN((*p) + 8, sz >= 8 ? 8 : 4); + *p += 8; + if (initial_boot_params->version < 0x10) + *p = _ALIGN(*p, sz >= 8 ? 8 : 4); pname = find_flat_dt_string(noff); if (pname == NULL) { printk("Can't find property name in list !\n"); break; } + if (strcmp(pname, "name") == 0) + has_name = 1; l = strlen(pname) + 1; pp = unflatten_dt_alloc(&mem, sizeof(struct property), __alignof__(struct property)); @@ -801,6 +873,36 @@ static unsigned long __init unflatten_dt_node(unsigned long mem, } *p = _ALIGN((*p) + sz, 4); } + /* with version 0x10 we may not have the name property, recreate + * it here from the unit name if absent + */ + if (!has_name) { + char *p = pathp, *ps = pathp, *pa = NULL; + int sz; + + while (*p) { + if ((*p) == '@') + pa = p; + if ((*p) == '/') + ps = p + 1; + p++; + } + if (pa < ps) + pa = p; + sz = (pa - ps) + 1; + pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz, + __alignof__(struct property)); + if (allnextpp) { + pp->name = "name"; + pp->length = sz; + pp->value = (unsigned char *)(pp + 1); + *prev_pp = pp; + prev_pp = &pp->next; + memcpy(pp->value, ps, sz - 1); + ((char *)pp->value)[sz - 1] = 0; + DBG("fixed up name for %s -> %s\n", pathp, pp->value); + } + } if (allnextpp) { *prev_pp = NULL; np->name = get_property(np, "name", NULL); @@ -812,7 +914,7 @@ static unsigned long __init unflatten_dt_node(unsigned long mem, np->type = ""; } while (tag == OF_DT_BEGIN_NODE) { - mem = unflatten_dt_node(mem, p, np, allnextpp); + mem = unflatten_dt_node(mem, p, np, allnextpp, fpsize); tag = *((u32 *)(*p)); } if (tag != OF_DT_END_NODE) { @@ -842,21 +944,27 @@ void __init unflatten_device_tree(void) /* First pass, scan for size */ start = ((unsigned long)initial_boot_params) + initial_boot_params->off_dt_struct; - size = unflatten_dt_node(0, &start, NULL, NULL); + size = unflatten_dt_node(0, &start, NULL, NULL, 0); + size = (size | 3) + 1; DBG(" size is %lx, allocating...\n", size); /* Allocate memory for the expanded device tree */ - mem = (unsigned long)abs_to_virt(lmb_alloc(size, + mem = (unsigned long)abs_to_virt(lmb_alloc(size + 4, __alignof__(struct device_node))); + ((u32 *)mem)[size / 4] = 0xdeadbeef; + DBG(" unflattening...\n", mem); /* Second pass, do actual unflattening */ start = ((unsigned long)initial_boot_params) + initial_boot_params->off_dt_struct; - unflatten_dt_node(mem, &start, NULL, &allnextp); + unflatten_dt_node(mem, &start, NULL, &allnextp, 0); if (*((u32 *)start) != OF_DT_END) - printk(KERN_WARNING "Weird tag at end of tree: %x\n", *((u32 *)start)); + printk(KERN_WARNING "Weird tag at end of tree: %08x\n", *((u32 *)start)); + if (((u32 *)mem)[size / 4] != 0xdeadbeef) + printk(KERN_WARNING "End of tree marker overwritten: %08x\n", + ((u32 *)mem)[size / 4] ); *allnextp = NULL; /* Get pointer to OF "/chosen" node for use everywhere */ @@ -880,7 +988,7 @@ void __init unflatten_device_tree(void) static int __init early_init_dt_scan_cpus(unsigned long node, - const char *full_path, void *data) + const char *uname, int depth, void *data) { char *type = get_flat_dt_prop(node, "device_type", NULL); u32 *prop; @@ -947,13 +1055,15 @@ static int __init early_init_dt_scan_cpus(unsigned long node, } static int __init early_init_dt_scan_chosen(unsigned long node, - const char *full_path, void *data) + const char *uname, int depth, void *data) { u32 *prop; u64 *prop64; extern unsigned long memory_limit, tce_alloc_start, tce_alloc_end; - if (strcmp(full_path, "/chosen") != 0) + DBG("search \"chosen\", depth: %d, uname: %s\n", depth, uname); + + if (depth != 1 || strcmp(uname, "chosen") != 0) return 0; /* get platform type */ @@ -1003,18 +1113,20 @@ static int __init early_init_dt_scan_chosen(unsigned long node, } static int __init early_init_dt_scan_root(unsigned long node, - const char *full_path, void *data) + const char *uname, int depth, void *data) { u32 *prop; - if (strcmp(full_path, "/") != 0) + if (depth != 0) return 0; prop = (u32 *)get_flat_dt_prop(node, "#size-cells", NULL); dt_root_size_cells = (prop == NULL) ? 1 : *prop; - + DBG("dt_root_size_cells = %x\n", dt_root_size_cells); + prop = (u32 *)get_flat_dt_prop(node, "#address-cells", NULL); dt_root_addr_cells = (prop == NULL) ? 2 : *prop; + DBG("dt_root_addr_cells = %x\n", dt_root_addr_cells); /* break now */ return 1; @@ -1042,7 +1154,7 @@ static unsigned long __init dt_mem_next_cell(int s, cell_t **cellp) static int __init early_init_dt_scan_memory(unsigned long node, - const char *full_path, void *data) + const char *uname, int depth, void *data) { char *type = get_flat_dt_prop(node, "device_type", NULL); cell_t *reg, *endp; @@ -1058,7 +1170,9 @@ static int __init early_init_dt_scan_memory(unsigned long node, endp = reg + (l / sizeof(cell_t)); - DBG("memory scan node %s ...\n", full_path); + DBG("memory scan node %s ..., reg size %ld, data: %x %x %x %x, ...\n", + uname, l, reg[0], reg[1], reg[2], reg[3]); + while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { unsigned long base, size; @@ -1469,10 +1583,11 @@ struct device_node *of_find_node_by_path(const char *path) struct device_node *np = allnodes; read_lock(&devtree_lock); - for (; np != 0; np = np->allnext) + for (; np != 0; np = np->allnext) { if (np->full_name != 0 && strcasecmp(np->full_name, path) == 0 && of_node_get(np)) break; + } read_unlock(&devtree_lock); return np; } diff --git a/arch/ppc64/kernel/prom_init.c b/arch/ppc64/kernel/prom_init.c index dbbe6c79..adcf972 100644 --- a/arch/ppc64/kernel/prom_init.c +++ b/arch/ppc64/kernel/prom_init.c @@ -1534,7 +1534,8 @@ static unsigned long __init dt_find_string(char *str) */ #define MAX_PROPERTY_NAME 64 -static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, +static void __init scan_dt_build_strings(phandle node, + unsigned long *mem_start, unsigned long *mem_end) { unsigned long offset = reloc_offset(); @@ -1547,16 +1548,21 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, /* get and store all property names */ prev_name = RELOC(""); for (;;) { - int rc; - /* 64 is max len of name including nul. */ namep = make_room(mem_start, mem_end, MAX_PROPERTY_NAME, 1); - rc = call_prom("nextprop", 3, 1, node, prev_name, namep); - if (rc != 1) { + if (call_prom("nextprop", 3, 1, node, prev_name, namep) != 1) { /* No more nodes: unwind alloc */ *mem_start = (unsigned long)namep; break; } + + /* skip "name" */ + if (strcmp(namep, RELOC("name")) == 0) { + *mem_start = (unsigned long)namep; + prev_name = RELOC("name"); + continue; + } + /* get/create string entry */ soff = dt_find_string(namep); if (soff != 0) { *mem_start = (unsigned long)namep; @@ -1571,7 +1577,7 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, /* do all our children */ child = call_prom("child", 1, 1, node); - while (child != (phandle)0) { + while (child != 0) { scan_dt_build_strings(child, mem_start, mem_end); child = call_prom("peer", 1, 1, child); } @@ -1580,16 +1586,13 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, unsigned long *mem_end) { - int l, align; phandle child; - char *namep, *prev_name, *sstart, *p, *ep; + char *namep, *prev_name, *sstart, *p, *ep, *lp, *path; unsigned long soff; unsigned char *valp; unsigned long offset = reloc_offset(); - char pname[MAX_PROPERTY_NAME]; - char *path; - - path = RELOC(prom_scratch); + static char pname[MAX_PROPERTY_NAME]; + int l; dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end); @@ -1599,23 +1602,33 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, namep, *mem_end - *mem_start); if (l >= 0) { /* Didn't fit? Get more room. */ - if (l+1 > *mem_end - *mem_start) { + if ((l+1) > (*mem_end - *mem_start)) { namep = make_room(mem_start, mem_end, l+1, 1); call_prom("package-to-path", 3, 1, node, namep, l); } namep[l] = '\0'; + /* Fixup an Apple bug where they have bogus \0 chars in the * middle of the path in some properties */ for (p = namep, ep = namep + l; p < ep; p++) if (*p == '\0') { memmove(p, p+1, ep - p); - ep--; l--; + ep--; l--; p--; } - *mem_start = _ALIGN(((unsigned long) namep) + strlen(namep) + 1, 4); + + /* now try to extract the unit name in that mess */ + for (p = namep, lp = NULL; *p; p++) + if (*p == '/') + lp = p + 1; + if (lp != NULL) + memmove(namep, lp, strlen(lp) + 1); + *mem_start = _ALIGN(((unsigned long) namep) + + strlen(namep) + 1, 4); } /* get it again for debugging */ + path = RELOC(prom_scratch); memset(path, 0, PROM_SCRATCH_SIZE); call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1); @@ -1623,23 +1636,27 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, prev_name = RELOC(""); sstart = (char *)RELOC(dt_string_start); for (;;) { - int rc; - - rc = call_prom("nextprop", 3, 1, node, prev_name, pname); - if (rc != 1) + if (call_prom("nextprop", 3, 1, node, prev_name, + RELOC(pname)) != 1) break; + /* skip "name" */ + if (strcmp(RELOC(pname), RELOC("name")) == 0) { + prev_name = RELOC("name"); + continue; + } + /* find string offset */ - soff = dt_find_string(pname); + soff = dt_find_string(RELOC(pname)); if (soff == 0) { - prom_printf("WARNING: Can't find string index for <%s>, node %s\n", - pname, path); + prom_printf("WARNING: Can't find string index for" + " <%s>, node %s\n", RELOC(pname), path); break; } prev_name = sstart + soff; /* get length */ - l = call_prom("getproplen", 2, 1, node, pname); + l = call_prom("getproplen", 2, 1, node, RELOC(pname)); /* sanity checks */ if (l == PROM_ERROR) @@ -1648,7 +1665,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, prom_printf("WARNING: ignoring large property "); /* It seems OF doesn't null-terminate the path :-( */ prom_printf("[%s] ", path); - prom_printf("%s length 0x%x\n", pname, l); + prom_printf("%s length 0x%x\n", RELOC(pname), l); continue; } @@ -1658,17 +1675,16 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, dt_push_token(soff, mem_start, mem_end); /* push property content */ - align = (l >= 8) ? 8 : 4; - valp = make_room(mem_start, mem_end, l, align); - call_prom("getprop", 4, 1, node, pname, valp, l); + valp = make_room(mem_start, mem_end, l, 4); + call_prom("getprop", 4, 1, node, RELOC(pname), valp, l); *mem_start = _ALIGN(*mem_start, 4); } /* Add a "linux,phandle" property. */ soff = dt_find_string(RELOC("linux,phandle")); if (soff == 0) - prom_printf("WARNING: Can't find string index for " - " node %s\n", path); + prom_printf("WARNING: Can't find string index for" + " node %s\n", path); else { dt_push_token(OF_DT_PROP, mem_start, mem_end); dt_push_token(4, mem_start, mem_end); @@ -1679,7 +1695,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, /* do all our children */ child = call_prom("child", 1, 1, node); - while (child != (phandle)0) { + while (child != 0) { scan_dt_build_struct(child, mem_start, mem_end); child = call_prom("peer", 1, 1, child); } @@ -1718,7 +1734,8 @@ static void __init flatten_device_tree(void) /* Build header and make room for mem rsv map */ mem_start = _ALIGN(mem_start, 4); - hdr = make_room(&mem_start, &mem_end, sizeof(struct boot_param_header), 4); + hdr = make_room(&mem_start, &mem_end, + sizeof(struct boot_param_header), 4); RELOC(dt_header_start) = (unsigned long)hdr; rsvmap = make_room(&mem_start, &mem_end, sizeof(mem_reserve_map), 8); @@ -1731,11 +1748,11 @@ static void __init flatten_device_tree(void) namep = make_room(&mem_start, &mem_end, 16, 1); strcpy(namep, RELOC("linux,phandle")); mem_start = (unsigned long)namep + strlen(namep) + 1; - RELOC(dt_string_end) = mem_start; /* Build string array */ prom_printf("Building dt strings...\n"); scan_dt_build_strings(root, &mem_start, &mem_end); + RELOC(dt_string_end) = mem_start; /* Build structure */ mem_start = PAGE_ALIGN(mem_start); @@ -1750,9 +1767,11 @@ static void __init flatten_device_tree(void) hdr->totalsize = RELOC(dt_struct_end) - RELOC(dt_header_start); hdr->off_dt_struct = RELOC(dt_struct_start) - RELOC(dt_header_start); hdr->off_dt_strings = RELOC(dt_string_start) - RELOC(dt_header_start); + hdr->dt_strings_size = RELOC(dt_string_end) - RELOC(dt_string_start); hdr->off_mem_rsvmap = ((unsigned long)rsvmap) - RELOC(dt_header_start); hdr->version = OF_DT_VERSION; - hdr->last_comp_version = 1; + /* Version 16 is not backward compatible */ + hdr->last_comp_version = 0x10; /* Reserve the whole thing and copy the reserve map in, we * also bump mem_reserve_cnt to cause further reservations to @@ -1808,6 +1827,9 @@ static void __init fixup_device_tree(void) /* does it need fixup ? */ if (prom_getproplen(i2c, "interrupts") > 0) return; + + prom_printf("fixing up bogus interrupts for u3 i2c...\n"); + /* interrupt on this revision of u3 is number 0 and level */ interrupts[0] = 0; interrupts[1] = 1; diff --git a/include/asm-ppc64/prom.h b/include/asm-ppc64/prom.h index 04b1a84..dc5330b 100644 --- a/include/asm-ppc64/prom.h +++ b/include/asm-ppc64/prom.h @@ -22,13 +22,15 @@ #define RELOC(x) (*PTRRELOC(&(x))) /* Definitions used by the flattened device tree */ -#define OF_DT_HEADER 0xd00dfeed /* 4: version, 4: total size */ -#define OF_DT_BEGIN_NODE 0x1 /* Start node: full name */ +#define OF_DT_HEADER 0xd00dfeed /* marker */ +#define OF_DT_BEGIN_NODE 0x1 /* Start of node, full name */ #define OF_DT_END_NODE 0x2 /* End node */ -#define OF_DT_PROP 0x3 /* Property: name off, size, content */ +#define OF_DT_PROP 0x3 /* Property: name off, size, + * content */ +#define OF_DT_NOP 0x4 /* nop */ #define OF_DT_END 0x9 -#define OF_DT_VERSION 1 +#define OF_DT_VERSION 0x10 /* * This is what gets passed to the kernel by prom_init or kexec @@ -54,7 +56,9 @@ struct boot_param_header u32 version; /* format version */ u32 last_comp_version; /* last compatible version */ /* version 2 fields below */ - u32 boot_cpuid_phys; /* Which physical CPU id we're booting on */ + u32 boot_cpuid_phys; /* Physical CPU id we're booting on */ + /* version 3 fields below */ + u32 dt_strings_size; /* size of the DT strings block */ }; -- cgit v0.10.2 From 293da76b3d4c2f362f906bce8c5d2e053bdf8d44 Mon Sep 17 00:00:00 2001 From: Jake Moilanen Date: Thu, 9 Jun 2005 09:31:12 -0500 Subject: [PATCH] ppc64: PCI device-node failure detection OpenFirmware marks devices as failed in the device-tree when a hardware problem is detected. The kernel needs to fail config reads/writes to prevent a kernel crash when incorrect data is read. This patch validates that the device-node is not marked "fail" when config space reads/writes are attempted. Signed-off-by: Jake Moilanen Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/rtas_pci.c b/arch/ppc64/kernel/rtas_pci.c index 1048817..1dccada 100644 --- a/arch/ppc64/kernel/rtas_pci.c +++ b/arch/ppc64/kernel/rtas_pci.c @@ -58,6 +58,21 @@ static int config_access_valid(struct device_node *dn, int where) return 0; } +static int of_device_available(struct device_node * dn) +{ + char * status; + + status = get_property(dn, "status", NULL); + + if (!status) + return 1; + + if (!strcmp(status, "okay")) + return 1; + + return 0; +} + static int rtas_read_config(struct device_node *dn, int where, int size, u32 *val) { int returnval = -1; @@ -103,7 +118,7 @@ static int rtas_pci_read_config(struct pci_bus *bus, /* Search only direct children of the bus */ for (dn = busdn->child; dn; dn = dn->sibling) - if (dn->devfn == devfn) + if (dn->devfn == devfn && of_device_available(dn)) return rtas_read_config(dn, where, size, val); return PCIBIOS_DEVICE_NOT_FOUND; } @@ -146,7 +161,7 @@ static int rtas_pci_write_config(struct pci_bus *bus, /* Search only direct children of the bus */ for (dn = busdn->child; dn; dn = dn->sibling) - if (dn->devfn == devfn) + if (dn->devfn == devfn && of_device_available(dn)) return rtas_write_config(dn, where, size, val); return PCIBIOS_DEVICE_NOT_FOUND; } -- cgit v0.10.2 From 6020164499ff3a61cd8bebceb9e294a155079f71 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Tue, 28 Jun 2005 16:48:04 -0700 Subject: [PATCH] ppc64: change duplicate Kconfig menu "General setup" to "Bus Options" arch/ppc64/Kconfig defines a "General setup" menu, but also sources init/Kconfig which also defines a "General setup" menu. Both of these menus appear at the top level of make menuconfig. Having two menus with the same name is confusing. This patch renames the ppc64/Kconfig menu to be "Bus Options" and moves options in this menu which are not bus related to the end of the "Platform support" menu. There are many variations among architectures on the exact naming of the "Bus Options" menu. I chose to use the simplest one, which is also used in arch/ppc/Kconfig. Signed-off-by: Frank Rowand Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index 2ce8783..4d4f81c 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -350,13 +350,46 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +source "fs/Kconfig.binfmt" + +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs" + depends on SMP && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) + select HOTPLUG + ---help--- + Say Y here to be able to turn CPUs off and on. + + Say N if you are unsure. + +config PROC_DEVICETREE + bool "Support for Open Firmware device tree in /proc" + depends on !PPC_ISERIES + help + This option adds a device-tree directory under /proc which contains + an image of the device tree that the kernel copies from Open + Firmware. If unsure, say Y here. + +config CMDLINE_BOOL + bool "Default bootloader kernel arguments" + depends on !PPC_ISERIES + +config CMDLINE + string "Initial kernel command string" + depends on CMDLINE_BOOL + default "console=ttyS0,9600 console=tty0 root=/dev/sda2" + help + On some platforms, there is currently no way for the boot loader to + pass arguments to the kernel. For these platforms, you can supply + some command-line options at build time by entering them here. In + most cases you will need to specify the root device here. + endmenu config ISA_DMA_API bool default y -menu "General setup" +menu "Bus Options" config ISA bool @@ -389,45 +422,12 @@ config PCI_DOMAINS bool default PCI -source "fs/Kconfig.binfmt" - source "drivers/pci/Kconfig" -config HOTPLUG_CPU - bool "Support for hot-pluggable CPUs" - depends on SMP && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) - select HOTPLUG - ---help--- - Say Y here to be able to turn CPUs off and on. - - Say N if you are unsure. - source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" -config PROC_DEVICETREE - bool "Support for Open Firmware device tree in /proc" - depends on !PPC_ISERIES - help - This option adds a device-tree directory under /proc which contains - an image of the device tree that the kernel copies from Open - Firmware. If unsure, say Y here. - -config CMDLINE_BOOL - bool "Default bootloader kernel arguments" - depends on !PPC_ISERIES - -config CMDLINE - string "Initial kernel command string" - depends on CMDLINE_BOOL - default "console=ttyS0,9600 console=tty0 root=/dev/sda2" - help - On some platforms, there is currently no way for the boot loader to - pass arguments to the kernel. For these platforms, you can supply - some command-line options at build time by entering them here. In - most cases you will need to specify the root device here. - endmenu source "net/Kconfig" -- cgit v0.10.2 From 3e494c80481653bbc810b4e67651097595ea0294 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 12 Jul 2005 17:40:17 +1000 Subject: [PATCH] ppc64: split iSeries specific parts out of vio.c This patch splits the iSeries specific parts out of vio.c. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile index 2ecccb6..a22c94f 100644 --- a/arch/ppc64/kernel/Makefile +++ b/arch/ppc64/kernel/Makefile @@ -50,7 +50,9 @@ obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_BOOTX_TEXT) += btext.o obj-$(CONFIG_HVCS) += hvcserver.o -obj-$(CONFIG_IBMVIO) += vio.o + +vio-obj-$(CONFIG_PPC_ISERIES) += iSeries_vio.o +obj-$(CONFIG_IBMVIO) += vio.o $(vio-obj-y) obj-$(CONFIG_XICS) += xics.o obj-$(CONFIG_MPIC) += mpic.o diff --git a/arch/ppc64/kernel/iSeries_vio.c b/arch/ppc64/kernel/iSeries_vio.c new file mode 100644 index 0000000..e876b43 --- /dev/null +++ b/arch/ppc64/kernel/iSeries_vio.c @@ -0,0 +1,133 @@ +/* + * IBM PowerPC iSeries Virtual I/O Infrastructure Support. + * + * Copyright (c) 2005 Stephen Rothwell, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +struct device *iSeries_vio_dev = &vio_bus_device.dev; +EXPORT_SYMBOL(iSeries_vio_dev); + +static struct iommu_table veth_iommu_table; +static struct iommu_table vio_iommu_table; + +void __init iommu_vio_init(void) +{ + struct iommu_table *t; + struct iommu_table_cb cb; + unsigned long cbp; + unsigned long itc_entries; + + cb.itc_busno = 255; /* Bus 255 is the virtual bus */ + cb.itc_virtbus = 0xff; /* Ask for virtual bus */ + + cbp = virt_to_abs(&cb); + HvCallXm_getTceTableParms(cbp); + + itc_entries = cb.itc_size * PAGE_SIZE / sizeof(union tce_entry); + veth_iommu_table.it_size = itc_entries / 2; + veth_iommu_table.it_busno = cb.itc_busno; + veth_iommu_table.it_offset = cb.itc_offset; + veth_iommu_table.it_index = cb.itc_index; + veth_iommu_table.it_type = TCE_VB; + veth_iommu_table.it_blocksize = 1; + + t = iommu_init_table(&veth_iommu_table); + + if (!t) + printk("Virtual Bus VETH TCE table failed.\n"); + + vio_iommu_table.it_size = itc_entries - veth_iommu_table.it_size; + vio_iommu_table.it_busno = cb.itc_busno; + vio_iommu_table.it_offset = cb.itc_offset + + veth_iommu_table.it_size; + vio_iommu_table.it_index = cb.itc_index; + vio_iommu_table.it_type = TCE_VB; + vio_iommu_table.it_blocksize = 1; + + t = iommu_init_table(&vio_iommu_table); + + if (!t) + printk("Virtual Bus VIO TCE table failed.\n"); +} + +/** + * vio_register_device: - Register a new vio device. + * @voidev: The device to register. + */ +static struct vio_dev *__init vio_register_device_iseries(char *type, + uint32_t unit_num) +{ + struct vio_dev *viodev; + + /* allocate a vio_dev for this node */ + viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL); + if (!viodev) + return NULL; + memset(viodev, 0, sizeof(struct vio_dev)); + + snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%s%d", type, unit_num); + + return vio_register_device_common(viodev, viodev->dev.bus_id, type, + unit_num, &vio_iommu_table); +} + +void __init probe_bus_iseries(void) +{ + HvLpIndexMap vlan_map; + struct vio_dev *viodev; + int i; + + /* there is only one of each of these */ + vio_register_device_iseries("viocons", 0); + vio_register_device_iseries("vscsi", 0); + + vlan_map = HvLpConfig_getVirtualLanIndexMap(); + for (i = 0; i < HVMAXARCHITECTEDVIRTUALLANS; i++) { + if ((vlan_map & (0x8000 >> i)) == 0) + continue; + viodev = vio_register_device_iseries("vlan", i); + /* veth is special and has it own iommu_table */ + viodev->iommu_table = &veth_iommu_table; + } + for (i = 0; i < HVMAXARCHITECTEDVIRTUALDISKS; i++) + vio_register_device_iseries("viodasd", i); + for (i = 0; i < HVMAXARCHITECTEDVIRTUALCDROMS; i++) + vio_register_device_iseries("viocd", i); + for (i = 0; i < HVMAXARCHITECTEDVIRTUALTAPES; i++) + vio_register_device_iseries("viotape", i); +} + +/** + * vio_bus_init_iseries: - Initialize the iSeries virtual IO bus + */ +static int __init vio_bus_init_iseries(void) +{ + int err; + + err = vio_bus_init(); + if (err == 0) { + vio_bus_device.iommu_table = &vio_iommu_table; + iSeries_vio_dev = &vio_bus_device.dev; + probe_bus_iseries(); + } + return err; +} + +__initcall(vio_bus_init_iseries); diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index 0c0ba71..4b9e371 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -25,10 +25,6 @@ #include #include #include -#include -#include -#include -#include #define DBGENTER() pr_debug("%s entered\n", __FUNCTION__) @@ -41,26 +37,14 @@ static const struct vio_device_id *vio_match_device( static struct iommu_table *vio_build_iommu_table(struct vio_dev *); static int vio_num_address_cells; #endif -#ifdef CONFIG_PPC_ISERIES -static struct iommu_table veth_iommu_table; -static struct iommu_table vio_iommu_table; -#endif -static struct vio_dev vio_bus_device = { /* fake "parent" device */ +struct vio_dev vio_bus_device = { /* fake "parent" device */ .name = vio_bus_device.dev.bus_id, .type = "", -#ifdef CONFIG_PPC_ISERIES - .iommu_table = &vio_iommu_table, -#endif .dev.bus_id = "vio", .dev.bus = &vio_bus_type, }; #ifdef CONFIG_PPC_ISERIES -static struct vio_dev *__init vio_register_device_iseries(char *type, - uint32_t unit_num); - -struct device *iSeries_vio_dev = &vio_bus_device.dev; -EXPORT_SYMBOL(iSeries_vio_dev); #define device_is_compatible(a, b) 1 @@ -157,48 +141,6 @@ static const struct vio_device_id * vio_match_device(const struct vio_device_id return NULL; } -#ifdef CONFIG_PPC_ISERIES -void __init iommu_vio_init(void) -{ - struct iommu_table *t; - struct iommu_table_cb cb; - unsigned long cbp; - unsigned long itc_entries; - - cb.itc_busno = 255; /* Bus 255 is the virtual bus */ - cb.itc_virtbus = 0xff; /* Ask for virtual bus */ - - cbp = virt_to_abs(&cb); - HvCallXm_getTceTableParms(cbp); - - itc_entries = cb.itc_size * PAGE_SIZE / sizeof(union tce_entry); - veth_iommu_table.it_size = itc_entries / 2; - veth_iommu_table.it_busno = cb.itc_busno; - veth_iommu_table.it_offset = cb.itc_offset; - veth_iommu_table.it_index = cb.itc_index; - veth_iommu_table.it_type = TCE_VB; - veth_iommu_table.it_blocksize = 1; - - t = iommu_init_table(&veth_iommu_table); - - if (!t) - printk("Virtual Bus VETH TCE table failed.\n"); - - vio_iommu_table.it_size = itc_entries - veth_iommu_table.it_size; - vio_iommu_table.it_busno = cb.itc_busno; - vio_iommu_table.it_offset = cb.itc_offset + - veth_iommu_table.it_size; - vio_iommu_table.it_index = cb.itc_index; - vio_iommu_table.it_type = TCE_VB; - vio_iommu_table.it_blocksize = 1; - - t = iommu_init_table(&vio_iommu_table); - - if (!t) - printk("Virtual Bus VIO TCE table failed.\n"); -} -#endif - #ifdef CONFIG_PPC_PSERIES static void probe_bus_pseries(void) { @@ -223,38 +165,10 @@ static void probe_bus_pseries(void) } #endif -#ifdef CONFIG_PPC_ISERIES -static void probe_bus_iseries(void) -{ - HvLpIndexMap vlan_map = HvLpConfig_getVirtualLanIndexMap(); - struct vio_dev *viodev; - int i; - - /* there is only one of each of these */ - vio_register_device_iseries("viocons", 0); - vio_register_device_iseries("vscsi", 0); - - vlan_map = HvLpConfig_getVirtualLanIndexMap(); - for (i = 0; i < HVMAXARCHITECTEDVIRTUALLANS; i++) { - if ((vlan_map & (0x8000 >> i)) == 0) - continue; - viodev = vio_register_device_iseries("vlan", i); - /* veth is special and has it own iommu_table */ - viodev->iommu_table = &veth_iommu_table; - } - for (i = 0; i < HVMAXARCHITECTEDVIRTUALDISKS; i++) - vio_register_device_iseries("viodasd", i); - for (i = 0; i < HVMAXARCHITECTEDVIRTUALCDROMS; i++) - vio_register_device_iseries("viocd", i); - for (i = 0; i < HVMAXARCHITECTEDVIRTUALTAPES; i++) - vio_register_device_iseries("viotape", i); -} -#endif - /** * vio_bus_init: - Initialize the virtual IO bus */ -static int __init vio_bus_init(void) +int __init vio_bus_init(void) { int err; @@ -264,25 +178,35 @@ static int __init vio_bus_init(void) return err; } - /* the fake parent of all vio devices, just to give us a nice directory */ + /* the fake parent of all vio devices, just to give us + * a nice directory + */ err = device_register(&vio_bus_device.dev); if (err) { - printk(KERN_WARNING "%s: device_register returned %i\n", __FUNCTION__, - err); + printk(KERN_WARNING "%s: device_register returned %i\n", + __FUNCTION__, err); return err; } + return 0; +} + #ifdef CONFIG_PPC_PSERIES - probe_bus_pseries(); -#endif -#ifdef CONFIG_PPC_ISERIES - probe_bus_iseries(); -#endif +/** + * vio_bus_init_pseries: - Initialize the pSeries virtual IO bus + */ +static int __init vio_bus_init_pseries(void) +{ + int err; - return 0; + err = vio_bus_init(); + if (err == 0) + probe_bus_pseries(); + return err; } -__initcall(vio_bus_init); +__initcall(vio_bus_init_pseries); +#endif /* vio_dev refcount hit 0 */ static void __devinit vio_dev_release(struct device *dev) @@ -312,7 +236,7 @@ static ssize_t viodev_show_name(struct device *dev, struct device_attribute *att } DEVICE_ATTR(name, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_name, NULL); -static struct vio_dev * __devinit vio_register_device_common( +struct vio_dev * __devinit vio_register_device_common( struct vio_dev *viodev, char *name, char *type, uint32_t unit_address, struct iommu_table *iommu_table) { @@ -408,31 +332,6 @@ struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node) EXPORT_SYMBOL(vio_register_device_node); #endif -#ifdef CONFIG_PPC_ISERIES -/** - * vio_register_device: - Register a new vio device. - * @voidev: The device to register. - */ -static struct vio_dev *__init vio_register_device_iseries(char *type, - uint32_t unit_num) -{ - struct vio_dev *viodev; - - DBGENTER(); - - /* allocate a vio_dev for this node */ - viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL); - if (!viodev) - return NULL; - memset(viodev, 0, sizeof(struct vio_dev)); - - snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%s%d", type, unit_num); - - return vio_register_device_common(viodev, viodev->dev.bus_id, type, - unit_num, &vio_iommu_table); -} -#endif - void __devinit vio_unregister_device(struct vio_dev *viodev) { DBGENTER(); diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h index 20cd98e..1e6d4c4 100644 --- a/include/asm-ppc64/vio.h +++ b/include/asm-ppc64/vio.h @@ -56,6 +56,9 @@ const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length); int vio_get_irq(struct vio_dev *dev); int vio_enable_interrupts(struct vio_dev *dev); int vio_disable_interrupts(struct vio_dev *dev); +extern struct vio_dev * __devinit vio_register_device_common( + struct vio_dev *viodev, char *name, char *type, + uint32_t unit_address, struct iommu_table *iommu_table); extern struct dma_mapping_ops vio_dma_ops; @@ -95,9 +98,13 @@ struct vio_dev { struct device dev; }; +extern struct vio_dev vio_bus_device; + static inline struct vio_dev *to_vio_dev(struct device *dev) { return container_of(dev, struct vio_dev, dev); } +extern int vio_bus_init(void); + #endif /* _ASM_VIO_H */ -- cgit v0.10.2 From 8c65b5c955b8598d9c63b4e97392377269873a54 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 12 Jul 2005 17:42:49 +1000 Subject: [PATCH] ppc64: move iSeries vio iommu init Since the iSeries vio iommu tables cannot be used until after the vio bus has been initialised, move the initialisation of the tables to there. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/iSeries_vio.c b/arch/ppc64/kernel/iSeries_vio.c index e876b43..48f0ebf 100644 --- a/arch/ppc64/kernel/iSeries_vio.c +++ b/arch/ppc64/kernel/iSeries_vio.c @@ -27,7 +27,7 @@ EXPORT_SYMBOL(iSeries_vio_dev); static struct iommu_table veth_iommu_table; static struct iommu_table vio_iommu_table; -void __init iommu_vio_init(void) +static void __init iommu_vio_init(void) { struct iommu_table *t; struct iommu_table_cb cb; @@ -123,6 +123,7 @@ static int __init vio_bus_init_iseries(void) err = vio_bus_init(); if (err == 0) { + iommu_vio_init(); vio_bus_device.iommu_table = &vio_iommu_table; iSeries_vio_dev = &vio_bus_device.dev; probe_bus_iseries(); diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index 87f256d..9edfe26 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -686,9 +686,6 @@ void __init mem_init(void) mem_init_done = 1; -#ifdef CONFIG_PPC_ISERIES - iommu_vio_init(); -#endif /* Initialize the vDSO */ vdso_init(); } diff --git a/include/asm-ppc64/iommu.h b/include/asm-ppc64/iommu.h index 729de5c..72dcf81 100644 --- a/include/asm-ppc64/iommu.h +++ b/include/asm-ppc64/iommu.h @@ -104,9 +104,6 @@ extern void iommu_devnode_init_pSeries(struct device_node *dn); #ifdef CONFIG_PPC_ISERIES -/* Initializes tables for bio buses */ -extern void __init iommu_vio_init(void); - struct iSeries_Device_Node; /* Creates table for an individual device node */ extern void iommu_devnode_init_iSeries(struct iSeries_Device_Node *dn); -- cgit v0.10.2 From 6312236fe82bbd3b0e1dee60b3eb3b270a2f6aeb Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 12 Jul 2005 17:45:27 +1000 Subject: [PATCH] ppc64: make the bus matching function platform specific This patch allows us to have a different bus if matching function for each platform. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/iSeries_vio.c b/arch/ppc64/kernel/iSeries_vio.c index 48f0ebf..2656b1c 100644 --- a/arch/ppc64/kernel/iSeries_vio.c +++ b/arch/ppc64/kernel/iSeries_vio.c @@ -115,13 +115,23 @@ void __init probe_bus_iseries(void) } /** + * vio_match_device_iseries: - Tell if a iSeries VIO device matches a + * vio_device_id + */ +static int vio_match_device_iseries(const struct vio_device_id *id, + const struct vio_dev *dev) +{ + return strncmp(dev->type, id->type, strlen(id->type)) == 0; +} + +/** * vio_bus_init_iseries: - Initialize the iSeries virtual IO bus */ static int __init vio_bus_init_iseries(void) { int err; - err = vio_bus_init(); + err = vio_bus_init(vio_match_device_iseries); if (err == 0) { iommu_vio_init(); vio_bus_device.iommu_table = &vio_iommu_table; diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index 4b9e371..8a243ca 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -44,11 +44,8 @@ struct vio_dev vio_bus_device = { /* fake "parent" device */ .dev.bus = &vio_bus_type, }; -#ifdef CONFIG_PPC_ISERIES - -#define device_is_compatible(a, b) 1 - -#endif +static int (*is_match)(const struct vio_device_id *id, + const struct vio_dev *dev); /* convert from struct device to struct vio_dev and pass to driver. * dev->driver has already been set by generic code because vio_bus_match @@ -133,8 +130,7 @@ static const struct vio_device_id * vio_match_device(const struct vio_device_id DBGENTER(); while (ids->type) { - if ((strncmp(dev->type, ids->type, strlen(ids->type)) == 0) && - device_is_compatible(dev->dev.platform_data, ids->compat)) + if (is_match(ids, dev)) return ids; ids++; } @@ -168,10 +164,13 @@ static void probe_bus_pseries(void) /** * vio_bus_init: - Initialize the virtual IO bus */ -int __init vio_bus_init(void) +int __init vio_bus_init(int (*match_func)(const struct vio_device_id *id, + const struct vio_dev *dev)) { int err; + is_match = match_func; + err = bus_register(&vio_bus_type); if (err) { printk(KERN_ERR "failed to register VIO bus\n"); @@ -193,13 +192,24 @@ int __init vio_bus_init(void) #ifdef CONFIG_PPC_PSERIES /** + * vio_match_device_pseries: - Tell if a pSeries VIO device matches a + * vio_device_id + */ +static int vio_match_device_pseries(const struct vio_device_id *id, + const struct vio_dev *dev) +{ + return (strncmp(dev->type, id->type, strlen(id->type)) == 0) && + device_is_compatible(dev->dev.platform_data, id->compat); +} + +/** * vio_bus_init_pseries: - Initialize the pSeries virtual IO bus */ static int __init vio_bus_init_pseries(void) { int err; - err = vio_bus_init(); + err = vio_bus_init(vio_match_device_pseries); if (err == 0) probe_bus_pseries(); return err; diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h index 1e6d4c4..70644a2 100644 --- a/include/asm-ppc64/vio.h +++ b/include/asm-ppc64/vio.h @@ -105,6 +105,7 @@ static inline struct vio_dev *to_vio_dev(struct device *dev) return container_of(dev, struct vio_dev, dev); } -extern int vio_bus_init(void); +extern int vio_bus_init(int (*is_match)(const struct vio_device_id *id, + const struct vio_dev *dev)); #endif /* _ASM_VIO_H */ -- cgit v0.10.2 From 19dbd0f6a74f7529d6d49dd50ad6b31adbe0598d Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 12 Jul 2005 17:50:26 +1000 Subject: [PATCH] ppc64: split pSeries specific parts out of vio.c This patch just splits out the pSeries specific parts of vio.c. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile index a22c94f..cbf87dc 100644 --- a/arch/ppc64/kernel/Makefile +++ b/arch/ppc64/kernel/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_BOOTX_TEXT) += btext.o obj-$(CONFIG_HVCS) += hvcserver.o +vio-obj-$(CONFIG_PPC_PSERIES) += pSeries_vio.o vio-obj-$(CONFIG_PPC_ISERIES) += iSeries_vio.o obj-$(CONFIG_IBMVIO) += vio.o $(vio-obj-y) obj-$(CONFIG_XICS) += xics.o diff --git a/arch/ppc64/kernel/iSeries_vio.c b/arch/ppc64/kernel/iSeries_vio.c index 2656b1c..b4268cc 100644 --- a/arch/ppc64/kernel/iSeries_vio.c +++ b/arch/ppc64/kernel/iSeries_vio.c @@ -131,7 +131,7 @@ static int __init vio_bus_init_iseries(void) { int err; - err = vio_bus_init(vio_match_device_iseries); + err = vio_bus_init(vio_match_device_iseries, NULL, NULL); if (err == 0) { iommu_vio_init(); vio_bus_device.iommu_table = &vio_iommu_table; diff --git a/arch/ppc64/kernel/pSeries_vio.c b/arch/ppc64/kernel/pSeries_vio.c new file mode 100644 index 0000000..338f9e1 --- /dev/null +++ b/arch/ppc64/kernel/pSeries_vio.c @@ -0,0 +1,266 @@ +/* + * IBM PowerPC pSeries Virtual I/O Infrastructure Support. + * + * Copyright (c) 2003-2005 IBM Corp. + * Dave Engebretsen engebret@us.ibm.com + * Santiago Leon santil@us.ibm.com + * Hollis Blanchard + * Stephen Rothwell + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +extern struct subsystem devices_subsys; /* needed for vio_find_name() */ + +static void probe_bus_pseries(void) +{ + struct device_node *node_vroot, *of_node; + + node_vroot = find_devices("vdevice"); + if ((node_vroot == NULL) || (node_vroot->child == NULL)) + /* this machine doesn't do virtual IO, and that's ok */ + return; + + /* + * Create struct vio_devices for each virtual device in the device tree. + * Drivers will associate with them later. + */ + for (of_node = node_vroot->child; of_node != NULL; + of_node = of_node->sibling) { + printk(KERN_DEBUG "%s: processing %p\n", __FUNCTION__, of_node); + vio_register_device_node(of_node); + } +} + +/** + * vio_match_device_pseries: - Tell if a pSeries VIO device matches a + * vio_device_id + */ +static int vio_match_device_pseries(const struct vio_device_id *id, + const struct vio_dev *dev) +{ + return (strncmp(dev->type, id->type, strlen(id->type)) == 0) && + device_is_compatible(dev->dev.platform_data, id->compat); +} + +static void vio_release_device_pseries(struct device *dev) +{ + /* XXX free TCE table */ + of_node_put(dev->platform_data); +} + +static ssize_t viodev_show_devspec(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct device_node *of_node = dev->platform_data; + + return sprintf(buf, "%s\n", of_node->full_name); +} +DEVICE_ATTR(devspec, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_devspec, NULL); + +static void vio_unregister_device_pseries(struct vio_dev *viodev) +{ + device_remove_file(&viodev->dev, &dev_attr_devspec); +} + +/** + * vio_bus_init_pseries: - Initialize the pSeries virtual IO bus + */ +static int __init vio_bus_init_pseries(void) +{ + int err; + + err = vio_bus_init(vio_match_device_pseries, + vio_unregister_device_pseries, + vio_release_device_pseries); + if (err == 0) + probe_bus_pseries(); + return err; +} + +__initcall(vio_bus_init_pseries); + +/** + * vio_build_iommu_table: - gets the dma information from OF and + * builds the TCE tree. + * @dev: the virtual device. + * + * Returns a pointer to the built tce tree, or NULL if it can't + * find property. +*/ +static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) +{ + unsigned int *dma_window; + struct iommu_table *newTceTable; + unsigned long offset; + int dma_window_property_size; + + dma_window = (unsigned int *) get_property(dev->dev.platform_data, "ibm,my-dma-window", &dma_window_property_size); + if(!dma_window) { + return NULL; + } + + newTceTable = (struct iommu_table *) kmalloc(sizeof(struct iommu_table), GFP_KERNEL); + + /* There should be some code to extract the phys-encoded offset + using prom_n_addr_cells(). However, according to a comment + on earlier versions, it's always zero, so we don't bother */ + offset = dma_window[1] >> PAGE_SHIFT; + + /* TCE table size - measured in tce entries */ + newTceTable->it_size = dma_window[4] >> PAGE_SHIFT; + /* offset for VIO should always be 0 */ + newTceTable->it_offset = offset; + newTceTable->it_busno = 0; + newTceTable->it_index = (unsigned long)dma_window[0]; + newTceTable->it_type = TCE_VB; + + return iommu_init_table(newTceTable); +} + +/** + * vio_register_device_node: - Register a new vio device. + * @of_node: The OF node for this device. + * + * Creates and initializes a vio_dev structure from the data in + * of_node (dev.platform_data) and adds it to the list of virtual devices. + * Returns a pointer to the created vio_dev or NULL if node has + * NULL device_type or compatible fields. + */ +struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node) +{ + struct vio_dev *viodev; + unsigned int *unit_address; + unsigned int *irq_p; + + /* we need the 'device_type' property, in order to match with drivers */ + if ((NULL == of_node->type)) { + printk(KERN_WARNING + "%s: node %s missing 'device_type'\n", __FUNCTION__, + of_node->name ? of_node->name : ""); + return NULL; + } + + unit_address = (unsigned int *)get_property(of_node, "reg", NULL); + if (!unit_address) { + printk(KERN_WARNING "%s: node %s missing 'reg'\n", __FUNCTION__, + of_node->name ? of_node->name : ""); + return NULL; + } + + /* allocate a vio_dev for this node */ + viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL); + if (!viodev) { + return NULL; + } + memset(viodev, 0, sizeof(struct vio_dev)); + + viodev->dev.platform_data = of_node_get(of_node); + + viodev->irq = NO_IRQ; + irq_p = (unsigned int *)get_property(of_node, "interrupts", NULL); + if (irq_p) { + int virq = virt_irq_create_mapping(*irq_p); + if (virq == NO_IRQ) { + printk(KERN_ERR "Unable to allocate interrupt " + "number for %s\n", of_node->full_name); + } else + viodev->irq = irq_offset_up(virq); + } + + snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address); + + /* register with generic device framework */ + if (vio_register_device_common(viodev, of_node->name, of_node->type, + *unit_address, vio_build_iommu_table(viodev)) + == NULL) { + /* XXX free TCE table */ + kfree(viodev); + return NULL; + } + device_create_file(&viodev->dev, &dev_attr_devspec); + + return viodev; +} +EXPORT_SYMBOL(vio_register_device_node); + +/** + * vio_get_attribute: - get attribute for virtual device + * @vdev: The vio device to get property. + * @which: The property/attribute to be extracted. + * @length: Pointer to length of returned data size (unused if NULL). + * + * Calls prom.c's get_property() to return the value of the + * attribute specified by the preprocessor constant @which +*/ +const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length) +{ + return get_property(vdev->dev.platform_data, (char*)which, length); +} +EXPORT_SYMBOL(vio_get_attribute); + +/* vio_find_name() - internal because only vio.c knows how we formatted the + * kobject name + * XXX once vio_bus_type.devices is actually used as a kset in + * drivers/base/bus.c, this function should be removed in favor of + * "device_find(kobj_name, &vio_bus_type)" + */ +static struct vio_dev *vio_find_name(const char *kobj_name) +{ + struct kobject *found; + + found = kset_find_obj(&devices_subsys.kset, kobj_name); + if (!found) + return NULL; + + return to_vio_dev(container_of(found, struct device, kobj)); +} + +/** + * vio_find_node - find an already-registered vio_dev + * @vnode: device_node of the virtual device we're looking for + */ +struct vio_dev *vio_find_node(struct device_node *vnode) +{ + uint32_t *unit_address; + char kobj_name[BUS_ID_SIZE]; + + /* construct the kobject name from the device node */ + unit_address = (uint32_t *)get_property(vnode, "reg", NULL); + if (!unit_address) + return NULL; + snprintf(kobj_name, BUS_ID_SIZE, "%x", *unit_address); + + return vio_find_name(kobj_name); +} +EXPORT_SYMBOL(vio_find_node); + +int vio_enable_interrupts(struct vio_dev *dev) +{ + int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE); + if (rc != H_Success) + printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc); + return rc; +} +EXPORT_SYMBOL(vio_enable_interrupts); + +int vio_disable_interrupts(struct vio_dev *dev) +{ + int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE); + if (rc != H_Success) + printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc); + return rc; +} +EXPORT_SYMBOL(vio_disable_interrupts); diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index 8a243ca..3b790ba 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -1,10 +1,11 @@ /* * IBM PowerPC Virtual I/O Infrastructure Support. * - * Copyright (c) 2003 IBM Corp. + * Copyright (c) 2003-2005 IBM Corp. * Dave Engebretsen engebret@us.ibm.com * Santiago Leon santil@us.ibm.com * Hollis Blanchard + * Stephen Rothwell * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,29 +15,16 @@ #include #include -#include #include -#include #include #include -#include #include #include -#include #include -#include - -#define DBGENTER() pr_debug("%s entered\n", __FUNCTION__) - -extern struct subsystem devices_subsys; /* needed for vio_find_name() */ static const struct vio_device_id *vio_match_device( const struct vio_device_id *, const struct vio_dev *); -#ifdef CONFIG_PPC_PSERIES -static struct iommu_table *vio_build_iommu_table(struct vio_dev *); -static int vio_num_address_cells; -#endif struct vio_dev vio_bus_device = { /* fake "parent" device */ .name = vio_bus_device.dev.bus_id, .type = "", @@ -46,6 +34,8 @@ struct vio_dev vio_bus_device = { /* fake "parent" device */ static int (*is_match)(const struct vio_device_id *id, const struct vio_dev *dev); +static void (*unregister_device_callback)(struct vio_dev *dev); +static void (*release_device_callback)(struct device *dev); /* convert from struct device to struct vio_dev and pass to driver. * dev->driver has already been set by generic code because vio_bus_match @@ -57,8 +47,6 @@ static int vio_bus_probe(struct device *dev) const struct vio_device_id *id; int error = -ENODEV; - DBGENTER(); - if (!viodrv->probe) return error; @@ -76,8 +64,6 @@ static int vio_bus_remove(struct device *dev) struct vio_dev *viodev = to_vio_dev(dev); struct vio_driver *viodrv = to_vio_driver(dev->driver); - DBGENTER(); - if (viodrv->remove) { return viodrv->remove(viodev); } @@ -127,8 +113,6 @@ EXPORT_SYMBOL(vio_unregister_driver); static const struct vio_device_id * vio_match_device(const struct vio_device_id *ids, const struct vio_dev *dev) { - DBGENTER(); - while (ids->type) { if (is_match(ids, dev)) return ids; @@ -137,39 +121,19 @@ static const struct vio_device_id * vio_match_device(const struct vio_device_id return NULL; } -#ifdef CONFIG_PPC_PSERIES -static void probe_bus_pseries(void) -{ - struct device_node *node_vroot, *of_node; - - node_vroot = find_devices("vdevice"); - if ((node_vroot == NULL) || (node_vroot->child == NULL)) - /* this machine doesn't do virtual IO, and that's ok */ - return; - - vio_num_address_cells = prom_n_addr_cells(node_vroot->child); - - /* - * Create struct vio_devices for each virtual device in the device tree. - * Drivers will associate with them later. - */ - for (of_node = node_vroot->child; of_node != NULL; - of_node = of_node->sibling) { - printk(KERN_DEBUG "%s: processing %p\n", __FUNCTION__, of_node); - vio_register_device_node(of_node); - } -} -#endif - /** * vio_bus_init: - Initialize the virtual IO bus */ int __init vio_bus_init(int (*match_func)(const struct vio_device_id *id, - const struct vio_dev *dev)) + const struct vio_dev *dev), + void (*unregister_dev)(struct vio_dev *), + void (*release_dev)(struct device *)) { int err; is_match = match_func; + unregister_device_callback = unregister_dev; + release_device_callback = release_dev; err = bus_register(&vio_bus_type); if (err) { @@ -190,56 +154,14 @@ int __init vio_bus_init(int (*match_func)(const struct vio_device_id *id, return 0; } -#ifdef CONFIG_PPC_PSERIES -/** - * vio_match_device_pseries: - Tell if a pSeries VIO device matches a - * vio_device_id - */ -static int vio_match_device_pseries(const struct vio_device_id *id, - const struct vio_dev *dev) -{ - return (strncmp(dev->type, id->type, strlen(id->type)) == 0) && - device_is_compatible(dev->dev.platform_data, id->compat); -} - -/** - * vio_bus_init_pseries: - Initialize the pSeries virtual IO bus - */ -static int __init vio_bus_init_pseries(void) -{ - int err; - - err = vio_bus_init(vio_match_device_pseries); - if (err == 0) - probe_bus_pseries(); - return err; -} - -__initcall(vio_bus_init_pseries); -#endif - /* vio_dev refcount hit 0 */ static void __devinit vio_dev_release(struct device *dev) { - DBGENTER(); - -#ifdef CONFIG_PPC_PSERIES - /* XXX free TCE table */ - of_node_put(dev->platform_data); -#endif + if (release_device_callback) + release_device_callback(dev); kfree(to_vio_dev(dev)); } -#ifdef CONFIG_PPC_PSERIES -static ssize_t viodev_show_devspec(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct device_node *of_node = dev->platform_data; - - return sprintf(buf, "%s\n", of_node->full_name); -} -DEVICE_ATTR(devspec, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_devspec, NULL); -#endif - static ssize_t viodev_show_name(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_vio_dev(dev)->name); @@ -250,8 +172,6 @@ struct vio_dev * __devinit vio_register_device_common( struct vio_dev *viodev, char *name, char *type, uint32_t unit_address, struct iommu_table *iommu_table) { - DBGENTER(); - viodev->name = name; viodev->type = type; viodev->unit_address = unit_address; @@ -272,197 +192,15 @@ struct vio_dev * __devinit vio_register_device_common( return viodev; } -#ifdef CONFIG_PPC_PSERIES -/** - * vio_register_device_node: - Register a new vio device. - * @of_node: The OF node for this device. - * - * Creates and initializes a vio_dev structure from the data in - * of_node (dev.platform_data) and adds it to the list of virtual devices. - * Returns a pointer to the created vio_dev or NULL if node has - * NULL device_type or compatible fields. - */ -struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node) -{ - struct vio_dev *viodev; - unsigned int *unit_address; - unsigned int *irq_p; - - DBGENTER(); - - /* we need the 'device_type' property, in order to match with drivers */ - if ((NULL == of_node->type)) { - printk(KERN_WARNING - "%s: node %s missing 'device_type'\n", __FUNCTION__, - of_node->name ? of_node->name : ""); - return NULL; - } - - unit_address = (unsigned int *)get_property(of_node, "reg", NULL); - if (!unit_address) { - printk(KERN_WARNING "%s: node %s missing 'reg'\n", __FUNCTION__, - of_node->name ? of_node->name : ""); - return NULL; - } - - /* allocate a vio_dev for this node */ - viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL); - if (!viodev) { - return NULL; - } - memset(viodev, 0, sizeof(struct vio_dev)); - - viodev->dev.platform_data = of_node_get(of_node); - - viodev->irq = NO_IRQ; - irq_p = (unsigned int *)get_property(of_node, "interrupts", NULL); - if (irq_p) { - int virq = virt_irq_create_mapping(*irq_p); - if (virq == NO_IRQ) { - printk(KERN_ERR "Unable to allocate interrupt " - "number for %s\n", of_node->full_name); - } else - viodev->irq = irq_offset_up(virq); - } - - snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address); - - /* register with generic device framework */ - if (vio_register_device_common(viodev, of_node->name, of_node->type, - *unit_address, vio_build_iommu_table(viodev)) - == NULL) { - /* XXX free TCE table */ - kfree(viodev); - return NULL; - } - device_create_file(&viodev->dev, &dev_attr_devspec); - - return viodev; -} -EXPORT_SYMBOL(vio_register_device_node); -#endif - void __devinit vio_unregister_device(struct vio_dev *viodev) { - DBGENTER(); -#ifdef CONFIG_PPC_PSERIES - device_remove_file(&viodev->dev, &dev_attr_devspec); -#endif + if (unregister_device_callback) + unregister_device_callback(viodev); device_remove_file(&viodev->dev, &dev_attr_name); device_unregister(&viodev->dev); } EXPORT_SYMBOL(vio_unregister_device); -#ifdef CONFIG_PPC_PSERIES -/** - * vio_get_attribute: - get attribute for virtual device - * @vdev: The vio device to get property. - * @which: The property/attribute to be extracted. - * @length: Pointer to length of returned data size (unused if NULL). - * - * Calls prom.c's get_property() to return the value of the - * attribute specified by the preprocessor constant @which -*/ -const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length) -{ - return get_property(vdev->dev.platform_data, (char*)which, length); -} -EXPORT_SYMBOL(vio_get_attribute); - -/* vio_find_name() - internal because only vio.c knows how we formatted the - * kobject name - * XXX once vio_bus_type.devices is actually used as a kset in - * drivers/base/bus.c, this function should be removed in favor of - * "device_find(kobj_name, &vio_bus_type)" - */ -static struct vio_dev *vio_find_name(const char *kobj_name) -{ - struct kobject *found; - - found = kset_find_obj(&devices_subsys.kset, kobj_name); - if (!found) - return NULL; - - return to_vio_dev(container_of(found, struct device, kobj)); -} - -/** - * vio_find_node - find an already-registered vio_dev - * @vnode: device_node of the virtual device we're looking for - */ -struct vio_dev *vio_find_node(struct device_node *vnode) -{ - uint32_t *unit_address; - char kobj_name[BUS_ID_SIZE]; - - /* construct the kobject name from the device node */ - unit_address = (uint32_t *)get_property(vnode, "reg", NULL); - if (!unit_address) - return NULL; - snprintf(kobj_name, BUS_ID_SIZE, "%x", *unit_address); - - return vio_find_name(kobj_name); -} -EXPORT_SYMBOL(vio_find_node); - -/** - * vio_build_iommu_table: - gets the dma information from OF and builds the TCE tree. - * @dev: the virtual device. - * - * Returns a pointer to the built tce tree, or NULL if it can't - * find property. -*/ -static struct iommu_table * vio_build_iommu_table(struct vio_dev *dev) -{ - unsigned int *dma_window; - struct iommu_table *newTceTable; - unsigned long offset; - int dma_window_property_size; - - dma_window = (unsigned int *) get_property(dev->dev.platform_data, "ibm,my-dma-window", &dma_window_property_size); - if(!dma_window) { - return NULL; - } - - newTceTable = (struct iommu_table *) kmalloc(sizeof(struct iommu_table), GFP_KERNEL); - - /* There should be some code to extract the phys-encoded offset - using prom_n_addr_cells(). However, according to a comment - on earlier versions, it's always zero, so we don't bother */ - offset = dma_window[1] >> PAGE_SHIFT; - - /* TCE table size - measured in tce entries */ - newTceTable->it_size = dma_window[4] >> PAGE_SHIFT; - /* offset for VIO should always be 0 */ - newTceTable->it_offset = offset; - newTceTable->it_busno = 0; - newTceTable->it_index = (unsigned long)dma_window[0]; - newTceTable->it_type = TCE_VB; - - return iommu_init_table(newTceTable); -} - -int vio_enable_interrupts(struct vio_dev *dev) -{ - int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE); - if (rc != H_Success) { - printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc); - } - return rc; -} -EXPORT_SYMBOL(vio_enable_interrupts); - -int vio_disable_interrupts(struct vio_dev *dev) -{ - int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE); - if (rc != H_Success) { - printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc); - } - return rc; -} -EXPORT_SYMBOL(vio_disable_interrupts); -#endif - static dma_addr_t vio_map_single(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) { @@ -526,8 +264,6 @@ static int vio_bus_match(struct device *dev, struct device_driver *drv) const struct vio_device_id *ids = vio_drv->id_table; const struct vio_device_id *found_id; - DBGENTER(); - if (!ids) return 0; diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h index 70644a2..a82e87c 100644 --- a/include/asm-ppc64/vio.h +++ b/include/asm-ppc64/vio.h @@ -106,6 +106,8 @@ static inline struct vio_dev *to_vio_dev(struct device *dev) } extern int vio_bus_init(int (*is_match)(const struct vio_device_id *id, - const struct vio_dev *dev)); + const struct vio_dev *dev), + void (*)(struct vio_dev *), + void (*)(struct device *)); #endif /* _ASM_VIO_H */ -- cgit v0.10.2 From 2e2446ea0758cd57dd065962d9544e3f4d44ea2b Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 19 Aug 2005 14:52:31 +1000 Subject: [PATCH] Remove NACA fixed address constraint Comments in head.S suggest that the iSeries naca has a fixed address, because tools expect to find it there. The only tool which appears to access the naca is addRamDisk, but both the in-kernel version and the version used in RHEL and SuSE in fact locate the NACA the same way as the hypervisor does, by following the pointer in the hvReleaseData structure. Since the requirement for a fixed address seems to be obsolete, this patch removes the naca from head.S and replaces it with a normal C initializer. For good measure, it removes an old version of addRamDisk.c which was sitting, unused, in the ppc32 tree. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc/boot/utils/addRamDisk.c b/arch/ppc/boot/utils/addRamDisk.c deleted file mode 100644 index 93400df..0000000 --- a/arch/ppc/boot/utils/addRamDisk.c +++ /dev/null @@ -1,203 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#define ElfHeaderSize (64 * 1024) -#define ElfPages (ElfHeaderSize / 4096) -#define KERNELBASE (0xc0000000) - -void get4k(FILE *file, char *buf ) -{ - unsigned j; - unsigned num = fread(buf, 1, 4096, file); - for ( j=num; j<4096; ++j ) - buf[j] = 0; -} - -void put4k(FILE *file, char *buf ) -{ - fwrite(buf, 1, 4096, file); -} - -void death(const char *msg, FILE *fdesc, const char *fname) -{ - printf(msg); - fclose(fdesc); - unlink(fname); - exit(1); -} - -int main(int argc, char **argv) -{ - char inbuf[4096]; - FILE *ramDisk = NULL; - FILE *inputVmlinux = NULL; - FILE *outputVmlinux = NULL; - unsigned i = 0; - u_int32_t ramFileLen = 0; - u_int32_t ramLen = 0; - u_int32_t roundR = 0; - u_int32_t kernelLen = 0; - u_int32_t actualKernelLen = 0; - u_int32_t round = 0; - u_int32_t roundedKernelLen = 0; - u_int32_t ramStartOffs = 0; - u_int32_t ramPages = 0; - u_int32_t roundedKernelPages = 0; - u_int32_t hvReleaseData = 0; - u_int32_t eyeCatcher = 0xc8a5d9c4; - u_int32_t naca = 0; - u_int32_t xRamDisk = 0; - u_int32_t xRamDiskSize = 0; - if ( argc < 2 ) { - printf("Name of RAM disk file missing.\n"); - exit(1); - } - - if ( argc < 3 ) { - printf("Name of vmlinux file missing.\n"); - exit(1); - } - - if ( argc < 4 ) { - printf("Name of vmlinux output file missing.\n"); - exit(1); - } - - ramDisk = fopen(argv[1], "r"); - if ( ! ramDisk ) { - printf("RAM disk file \"%s\" failed to open.\n", argv[1]); - exit(1); - } - inputVmlinux = fopen(argv[2], "r"); - if ( ! inputVmlinux ) { - printf("vmlinux file \"%s\" failed to open.\n", argv[2]); - exit(1); - } - outputVmlinux = fopen(argv[3], "w+"); - if ( ! outputVmlinux ) { - printf("output vmlinux file \"%s\" failed to open.\n", argv[3]); - exit(1); - } - fseek(ramDisk, 0, SEEK_END); - ramFileLen = ftell(ramDisk); - fseek(ramDisk, 0, SEEK_SET); - printf("%s file size = %d\n", argv[1], ramFileLen); - - ramLen = ramFileLen; - - roundR = 4096 - (ramLen % 4096); - if ( roundR ) { - printf("Rounding RAM disk file up to a multiple of 4096, adding %d\n", roundR); - ramLen += roundR; - } - - printf("Rounded RAM disk size is %d\n", ramLen); - fseek(inputVmlinux, 0, SEEK_END); - kernelLen = ftell(inputVmlinux); - fseek(inputVmlinux, 0, SEEK_SET); - printf("kernel file size = %d\n", kernelLen); - if ( kernelLen == 0 ) { - printf("You must have a linux kernel specified as argv[2]\n"); - exit(1); - } - - actualKernelLen = kernelLen - ElfHeaderSize; - - printf("actual kernel length (minus ELF header) = %d\n", actualKernelLen); - - round = actualKernelLen % 4096; - roundedKernelLen = actualKernelLen; - if ( round ) - roundedKernelLen += (4096 - round); - - printf("actual kernel length rounded up to a 4k multiple = %d\n", roundedKernelLen); - - ramStartOffs = roundedKernelLen; - ramPages = ramLen / 4096; - - printf("RAM disk pages to copy = %d\n", ramPages); - - // Copy 64K ELF header - for (i=0; i<(ElfPages); ++i) { - get4k( inputVmlinux, inbuf ); - put4k( outputVmlinux, inbuf ); - } - - roundedKernelPages = roundedKernelLen / 4096; - - fseek(inputVmlinux, ElfHeaderSize, SEEK_SET); - - for ( i=0; i #include #include -#include #include #include #include @@ -511,24 +510,10 @@ _GLOBAL(do_stab_bolted_pSeries) mfspr r12,SPRG2 EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted) - - /* Space for the naca. Architected to be located at real address - * NACA_PHYS_ADDR. Various tools rely on this location being fixed. - * The first dword of the naca is required by iSeries LPAR to - * point to itVpdAreas. On pSeries native, this value is not used. - */ - . = NACA_PHYS_ADDR - .globl __end_interrupts -__end_interrupts: -#ifdef CONFIG_PPC_ISERIES - .globl naca -naca: - .llong itVpdAreas - .llong 0 /* xRamDisk */ - .llong 0 /* xRamDiskSize */ . = 0x6100 +#ifdef CONFIG_PPC_ISERIES /*** ISeries-LPAR interrupt handlers ***/ STD_EXCEPTION_ISERIES(0x200, machine_check, PACA_EXMC) diff --git a/include/asm-ppc64/naca.h b/include/asm-ppc64/naca.h index bfb7caa..d2afe64 100644 --- a/include/asm-ppc64/naca.h +++ b/include/asm-ppc64/naca.h @@ -12,8 +12,6 @@ #include -#ifndef __ASSEMBLY__ - struct naca_struct { /* Kernel only data - undefined for user space */ void *xItVpdAreas; /* VPD Data 0x00 */ @@ -23,9 +21,4 @@ struct naca_struct { extern struct naca_struct naca; -#endif /* __ASSEMBLY__ */ - -#define NACA_PAGE 0x4 -#define NACA_PHYS_ADDR (NACA_PAGE< Date: Fri, 19 Aug 2005 14:52:31 +1000 Subject: [PATCH] Move iSeries and common vectors into unused space in head.S In the ppc64 kernel head.S there is currently quite a lot of unused space between the naca (at fixed address 0x4000) and the fwnmi data area (at fixed address 0x7000). This patch moves various exception vectors and support code into this region to use the wasted space. The functions load_up_fpu and load_up_altivec are moved down as well, since they are essentially continuations of the fp_unavailable_common and altivec_unavailable_common vectors, respectively. Likewise, the fwnmi vectors themselves are moved down into this area, because while the location of the fwnmi data area is fixed by the RPA, the vectors themselves can be anywhere sufficiently low. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index 13c0364..eb54f05 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -52,9 +52,8 @@ * We layout physical memory as follows: * 0x0000 - 0x00ff : Secondary processor spin code * 0x0100 - 0x2fff : pSeries Interrupt prologs - * 0x3000 - 0x3fff : Interrupt support - * 0x4000 - 0x4fff : NACA - * 0x6000 : iSeries and common interrupt prologs + * 0x3000 - 0x6fff : interrupt support, iSeries and common interrupt prologs + * 0x7000 - 0x7fff : FWNMI data area * 0x9000 - 0x9fff : Initial segment table */ @@ -501,17 +500,35 @@ system_call_pSeries: STD_EXCEPTION_PSERIES(0x1300, instruction_breakpoint) STD_EXCEPTION_PSERIES(0x1700, altivec_assist) + . = 0x3000 + +/*** pSeries interrupt support ***/ + /* moved from 0xf00 */ - STD_EXCEPTION_PSERIES(0x3000, performance_monitor) + STD_EXCEPTION_PSERIES(., performance_monitor) - . = 0x3100 + .align 7 _GLOBAL(do_stab_bolted_pSeries) mtcrf 0x80,r12 mfspr r12,SPRG2 EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted) +/* + * Vectors for the FWNMI option. Share common code. + */ + .globl system_reset_fwnmi +system_reset_fwnmi: + HMT_MEDIUM + mtspr SPRG1,r13 /* save r13 */ + RUNLATCH_ON(r13) + EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common) - . = 0x6100 + .globl machine_check_fwnmi +machine_check_fwnmi: + HMT_MEDIUM + mtspr SPRG1,r13 /* save r13 */ + RUNLATCH_ON(r13) + EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) #ifdef CONFIG_PPC_ISERIES /*** ISeries-LPAR interrupt handlers ***/ @@ -656,51 +673,8 @@ hardware_interrupt_iSeries_masked: ld r13,PACA_EXGEN+EX_R13(r13) rfid b . /* prevent speculative execution */ -#endif - -/* - * Data area reserved for FWNMI option. - */ - .= 0x7000 - .globl fwnmi_data_area -fwnmi_data_area: - -#ifdef CONFIG_PPC_ISERIES - . = LPARMAP_PHYS -#include "lparmap.s" #endif /* CONFIG_PPC_ISERIES */ -/* - * Vectors for the FWNMI option. Share common code. - */ - . = 0x8000 - .globl system_reset_fwnmi -system_reset_fwnmi: - HMT_MEDIUM - mtspr SPRG1,r13 /* save r13 */ - RUNLATCH_ON(r13) - EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common) - .globl machine_check_fwnmi -machine_check_fwnmi: - HMT_MEDIUM - mtspr SPRG1,r13 /* save r13 */ - RUNLATCH_ON(r13) - EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) - - /* - * Space for the initial segment table - * For LPAR, the hypervisor must fill in at least one entry - * before we get control (with relocate on) - */ - . = STAB0_PHYS_ADDR - .globl __start_stab -__start_stab: - - . = (STAB0_PHYS_ADDR + PAGE_SIZE) - .globl __end_stab -__end_stab: - - /*** Common interrupt handlers ***/ STD_EXCEPTION_COMMON(0x100, system_reset, .system_reset_exception) @@ -891,6 +865,62 @@ fp_unavailable_common: bl .kernel_fp_unavailable_exception BUG_OPCODE +/* + * load_up_fpu(unused, unused, tsk) + * Disable FP for the task which had the FPU previously, + * and save its floating-point registers in its thread_struct. + * Enables the FPU for use in the kernel on return. + * On SMP we know the fpu is free, since we give it up every + * switch (ie, no lazy save of the FP registers). + * On entry: r13 == 'current' && last_task_used_math != 'current' + */ +_STATIC(load_up_fpu) + mfmsr r5 /* grab the current MSR */ + ori r5,r5,MSR_FP + mtmsrd r5 /* enable use of fpu now */ + isync +/* + * For SMP, we don't do lazy FPU switching because it just gets too + * horrendously complex, especially when a task switches from one CPU + * to another. Instead we call giveup_fpu in switch_to. + * + */ +#ifndef CONFIG_SMP + ld r3,last_task_used_math@got(r2) + ld r4,0(r3) + cmpdi 0,r4,0 + beq 1f + /* Save FP state to last_task_used_math's THREAD struct */ + addi r4,r4,THREAD + SAVE_32FPRS(0, r4) + mffs fr0 + stfd fr0,THREAD_FPSCR(r4) + /* Disable FP for last_task_used_math */ + ld r5,PT_REGS(r4) + ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) + li r6,MSR_FP|MSR_FE0|MSR_FE1 + andc r4,r4,r6 + std r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#endif /* CONFIG_SMP */ + /* enable use of FP after return */ + ld r4,PACACURRENT(r13) + addi r5,r4,THREAD /* Get THREAD */ + ld r4,THREAD_FPEXC_MODE(r5) + ori r12,r12,MSR_FP + or r12,r12,r4 + std r12,_MSR(r1) + lfd fr0,THREAD_FPSCR(r5) + mtfsf 0xff,fr0 + REST_32FPRS(0, r5) +#ifndef CONFIG_SMP + /* Update last_task_used_math to 'current' */ + subi r4,r5,THREAD /* Back to 'current' */ + std r4,0(r3) +#endif /* CONFIG_SMP */ + /* restore registers and return */ + b fast_exception_return + .align 7 .globl altivec_unavailable_common altivec_unavailable_common: @@ -906,6 +936,80 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) bl .altivec_unavailable_exception b .ret_from_except +#ifdef CONFIG_ALTIVEC +/* + * load_up_altivec(unused, unused, tsk) + * Disable VMX for the task which had it previously, + * and save its vector registers in its thread_struct. + * Enables the VMX for use in the kernel on return. + * On SMP we know the VMX is free, since we give it up every + * switch (ie, no lazy save of the vector registers). + * On entry: r13 == 'current' && last_task_used_altivec != 'current' + */ +_STATIC(load_up_altivec) + mfmsr r5 /* grab the current MSR */ + oris r5,r5,MSR_VEC@h + mtmsrd r5 /* enable use of VMX now */ + isync + +/* + * For SMP, we don't do lazy VMX switching because it just gets too + * horrendously complex, especially when a task switches from one CPU + * to another. Instead we call giveup_altvec in switch_to. + * VRSAVE isn't dealt with here, that is done in the normal context + * switch code. Note that we could rely on vrsave value to eventually + * avoid saving all of the VREGs here... + */ +#ifndef CONFIG_SMP + ld r3,last_task_used_altivec@got(r2) + ld r4,0(r3) + cmpdi 0,r4,0 + beq 1f + /* Save VMX state to last_task_used_altivec's THREAD struct */ + addi r4,r4,THREAD + SAVE_32VRS(0,r5,r4) + mfvscr vr0 + li r10,THREAD_VSCR + stvx vr0,r10,r4 + /* Disable VMX for last_task_used_altivec */ + ld r5,PT_REGS(r4) + ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) + lis r6,MSR_VEC@h + andc r4,r4,r6 + std r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#endif /* CONFIG_SMP */ + /* Hack: if we get an altivec unavailable trap with VRSAVE + * set to all zeros, we assume this is a broken application + * that fails to set it properly, and thus we switch it to + * all 1's + */ + mfspr r4,SPRN_VRSAVE + cmpdi 0,r4,0 + bne+ 1f + li r4,-1 + mtspr SPRN_VRSAVE,r4 +1: + /* enable use of VMX after return */ + ld r4,PACACURRENT(r13) + addi r5,r4,THREAD /* Get THREAD */ + oris r12,r12,MSR_VEC@h + std r12,_MSR(r1) + li r4,1 + li r10,THREAD_VSCR + stw r4,THREAD_USED_VR(r5) + lvx vr0,r10,r5 + mtvscr vr0 + REST_32VRS(0,r4,r5) +#ifndef CONFIG_SMP + /* Update last_task_used_math to 'current' */ + subi r4,r5,THREAD /* Back to 'current' */ + std r4,0(r3) +#endif /* CONFIG_SMP */ + /* restore registers and return */ + b fast_exception_return +#endif /* CONFIG_ALTIVEC */ + /* * Hash table stuff */ @@ -1152,6 +1256,27 @@ unrecov_slb: bl .unrecoverable_exception b 1b +/* + * Data area reserved for FWNMI option. + * This address (0x7000) is fixed by the RPA. + */ + .= 0x7000 + .globl fwnmi_data_area +fwnmi_data_area: + .space PAGE_SIZE + + /* + * Space for the initial segment table + * For LPAR, the hypervisor must fill in at least one entry + * before we get control (with relocate on) + */ + . = STAB0_PHYS_ADDR + .globl __start_stab +__start_stab: + + . = (STAB0_PHYS_ADDR + PAGE_SIZE) + .globl __end_stab +__end_stab: /* * On pSeries, secondary processors spin in the following code. @@ -1416,62 +1541,6 @@ _GLOBAL(copy_and_flush) copy_to_here: /* - * load_up_fpu(unused, unused, tsk) - * Disable FP for the task which had the FPU previously, - * and save its floating-point registers in its thread_struct. - * Enables the FPU for use in the kernel on return. - * On SMP we know the fpu is free, since we give it up every - * switch (ie, no lazy save of the FP registers). - * On entry: r13 == 'current' && last_task_used_math != 'current' - */ -_STATIC(load_up_fpu) - mfmsr r5 /* grab the current MSR */ - ori r5,r5,MSR_FP - mtmsrd r5 /* enable use of fpu now */ - isync -/* - * For SMP, we don't do lazy FPU switching because it just gets too - * horrendously complex, especially when a task switches from one CPU - * to another. Instead we call giveup_fpu in switch_to. - * - */ -#ifndef CONFIG_SMP - ld r3,last_task_used_math@got(r2) - ld r4,0(r3) - cmpdi 0,r4,0 - beq 1f - /* Save FP state to last_task_used_math's THREAD struct */ - addi r4,r4,THREAD - SAVE_32FPRS(0, r4) - mffs fr0 - stfd fr0,THREAD_FPSCR(r4) - /* Disable FP for last_task_used_math */ - ld r5,PT_REGS(r4) - ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) - li r6,MSR_FP|MSR_FE0|MSR_FE1 - andc r4,r4,r6 - std r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#endif /* CONFIG_SMP */ - /* enable use of FP after return */ - ld r4,PACACURRENT(r13) - addi r5,r4,THREAD /* Get THREAD */ - ld r4,THREAD_FPEXC_MODE(r5) - ori r12,r12,MSR_FP - or r12,r12,r4 - std r12,_MSR(r1) - lfd fr0,THREAD_FPSCR(r5) - mtfsf 0xff,fr0 - REST_32FPRS(0, r5) -#ifndef CONFIG_SMP - /* Update last_task_used_math to 'current' */ - subi r4,r5,THREAD /* Back to 'current' */ - std r4,0(r3) -#endif /* CONFIG_SMP */ - /* restore registers and return */ - b fast_exception_return - -/* * disable_kernel_fp() * Disable the FPU. */ @@ -1515,81 +1584,7 @@ _GLOBAL(giveup_fpu) #endif /* CONFIG_SMP */ blr - #ifdef CONFIG_ALTIVEC - -/* - * load_up_altivec(unused, unused, tsk) - * Disable VMX for the task which had it previously, - * and save its vector registers in its thread_struct. - * Enables the VMX for use in the kernel on return. - * On SMP we know the VMX is free, since we give it up every - * switch (ie, no lazy save of the vector registers). - * On entry: r13 == 'current' && last_task_used_altivec != 'current' - */ -_STATIC(load_up_altivec) - mfmsr r5 /* grab the current MSR */ - oris r5,r5,MSR_VEC@h - mtmsrd r5 /* enable use of VMX now */ - isync - -/* - * For SMP, we don't do lazy VMX switching because it just gets too - * horrendously complex, especially when a task switches from one CPU - * to another. Instead we call giveup_altvec in switch_to. - * VRSAVE isn't dealt with here, that is done in the normal context - * switch code. Note that we could rely on vrsave value to eventually - * avoid saving all of the VREGs here... - */ -#ifndef CONFIG_SMP - ld r3,last_task_used_altivec@got(r2) - ld r4,0(r3) - cmpdi 0,r4,0 - beq 1f - /* Save VMX state to last_task_used_altivec's THREAD struct */ - addi r4,r4,THREAD - SAVE_32VRS(0,r5,r4) - mfvscr vr0 - li r10,THREAD_VSCR - stvx vr0,r10,r4 - /* Disable VMX for last_task_used_altivec */ - ld r5,PT_REGS(r4) - ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) - lis r6,MSR_VEC@h - andc r4,r4,r6 - std r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#endif /* CONFIG_SMP */ - /* Hack: if we get an altivec unavailable trap with VRSAVE - * set to all zeros, we assume this is a broken application - * that fails to set it properly, and thus we switch it to - * all 1's - */ - mfspr r4,SPRN_VRSAVE - cmpdi 0,r4,0 - bne+ 1f - li r4,-1 - mtspr SPRN_VRSAVE,r4 -1: - /* enable use of VMX after return */ - ld r4,PACACURRENT(r13) - addi r5,r4,THREAD /* Get THREAD */ - oris r12,r12,MSR_VEC@h - std r12,_MSR(r1) - li r4,1 - li r10,THREAD_VSCR - stw r4,THREAD_USED_VR(r5) - lvx vr0,r10,r5 - mtvscr vr0 - REST_32VRS(0,r4,r5) -#ifndef CONFIG_SMP - /* Update last_task_used_math to 'current' */ - subi r4,r5,THREAD /* Back to 'current' */ - std r4,0(r3) -#endif /* CONFIG_SMP */ - /* restore registers and return */ - b fast_exception_return - /* * disable_kernel_altivec() * Disable the VMX. -- cgit v0.10.2 From c59c464a3e29830bcfae5eea1777cad9e00087f3 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 19 Aug 2005 14:52:31 +1000 Subject: [PATCH] Change address of ppc64 initial segment table On ppc64 machines with segment tables, CPU0's segment table is at a fixed address, currently 0x9000. This patch moves it to the free space at 0x6000, just below the fwnmi data area. This saves 8k of space in vmlinux and the runtime kernel image. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index eb54f05..7de38eb 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -52,9 +52,10 @@ * We layout physical memory as follows: * 0x0000 - 0x00ff : Secondary processor spin code * 0x0100 - 0x2fff : pSeries Interrupt prologs - * 0x3000 - 0x6fff : interrupt support, iSeries and common interrupt prologs + * 0x3000 - 0x5fff : interrupt support, iSeries and common interrupt prologs + * 0x6000 - 0x6fff : Initial (CPU0) segment table * 0x7000 - 0x7fff : FWNMI data area - * 0x9000 - 0x9fff : Initial segment table + * 0x8000 - : Early init and support code */ /* @@ -1257,6 +1258,20 @@ unrecov_slb: b 1b /* + * Space for CPU0's segment table. + * + * On iSeries, the hypervisor must fill in at least one entry before + * we get control (with relocate on). The address is give to the hv + * as a page number (see xLparMap in LparData.c), so this must be at a + * fixed address (the linker can't compute (u64)&initial_stab >> + * PAGE_SHIFT). + */ + . = STAB0_PHYS_ADDR /* 0x6000 */ + .globl initial_stab +initial_stab: + .space 4096 + +/* * Data area reserved for FWNMI option. * This address (0x7000) is fixed by the RPA. */ @@ -1265,19 +1280,6 @@ unrecov_slb: fwnmi_data_area: .space PAGE_SIZE - /* - * Space for the initial segment table - * For LPAR, the hypervisor must fill in at least one entry - * before we get control (with relocate on) - */ - . = STAB0_PHYS_ADDR - .globl __start_stab -__start_stab: - - . = (STAB0_PHYS_ADDR + PAGE_SIZE) - .globl __end_stab -__end_stab: - /* * On pSeries, secondary processors spin in the following code. * At entry, r3 = this processor's number (physical cpu id) diff --git a/arch/ppc64/kernel/pacaData.c b/arch/ppc64/kernel/pacaData.c index 6316188..6182a2c 100644 --- a/arch/ppc64/kernel/pacaData.c +++ b/arch/ppc64/kernel/pacaData.c @@ -78,7 +78,7 @@ extern unsigned long __toc_start; #define BOOTCPU_PACA_INIT(number) \ { \ - PACA_INIT_COMMON(number, 1, 0, STAB0_VIRT_ADDR) \ + PACA_INIT_COMMON(number, 1, 0, (u64)&initial_stab) \ PACA_INIT_ISERIES(number) \ } @@ -90,7 +90,7 @@ extern unsigned long __toc_start; #define BOOTCPU_PACA_INIT(number) \ { \ - PACA_INIT_COMMON(number, 1, STAB0_PHYS_ADDR, STAB0_VIRT_ADDR) \ + PACA_INIT_COMMON(number, 1, STAB0_PHYS_ADDR, (u64)&initial_stab) \ } #endif diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h index 959a4bf..789c269 100644 --- a/include/asm-ppc64/mmu.h +++ b/include/asm-ppc64/mmu.h @@ -28,9 +28,12 @@ #define STE_VSID_SHIFT 12 /* Location of cpu0's segment table */ -#define STAB0_PAGE 0x9 +#define STAB0_PAGE 0x6 #define STAB0_PHYS_ADDR (STAB0_PAGE< Date: Fri, 19 Aug 2005 14:52:31 +1000 Subject: [PATCH] Remove general use functions from head.S As well as the interrupt vectors and initialization code, head.S contains several asm functions which are used during runtime. This patch moves these to misc.S, a more sensible location for random asm support code. A couple The functions moved are: disable_kernel_fp giveup_fpu disable_kernel_altivec giveup_altivec __setup_cpu_power3 (empty function) Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index 7de38eb..70e1040 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -1542,98 +1542,6 @@ _GLOBAL(copy_and_flush) .align 8 copy_to_here: -/* - * disable_kernel_fp() - * Disable the FPU. - */ -_GLOBAL(disable_kernel_fp) - mfmsr r3 - rldicl r0,r3,(63-MSR_FP_LG),1 - rldicl r3,r0,(MSR_FP_LG+1),0 - mtmsrd r3 /* disable use of fpu now */ - isync - blr - -/* - * giveup_fpu(tsk) - * Disable FP for the task given as the argument, - * and save the floating-point registers in its thread_struct. - * Enables the FPU for use in the kernel on return. - */ -_GLOBAL(giveup_fpu) - mfmsr r5 - ori r5,r5,MSR_FP - mtmsrd r5 /* enable use of fpu now */ - isync - cmpdi 0,r3,0 - beqlr- /* if no previous owner, done */ - addi r3,r3,THREAD /* want THREAD of task */ - ld r5,PT_REGS(r3) - cmpdi 0,r5,0 - SAVE_32FPRS(0, r3) - mffs fr0 - stfd fr0,THREAD_FPSCR(r3) - beq 1f - ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) - li r3,MSR_FP|MSR_FE0|MSR_FE1 - andc r4,r4,r3 /* disable FP for previous task */ - std r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#ifndef CONFIG_SMP - li r5,0 - ld r4,last_task_used_math@got(r2) - std r5,0(r4) -#endif /* CONFIG_SMP */ - blr - -#ifdef CONFIG_ALTIVEC -/* - * disable_kernel_altivec() - * Disable the VMX. - */ -_GLOBAL(disable_kernel_altivec) - mfmsr r3 - rldicl r0,r3,(63-MSR_VEC_LG),1 - rldicl r3,r0,(MSR_VEC_LG+1),0 - mtmsrd r3 /* disable use of VMX now */ - isync - blr - -/* - * giveup_altivec(tsk) - * Disable VMX for the task given as the argument, - * and save the vector registers in its thread_struct. - * Enables the VMX for use in the kernel on return. - */ -_GLOBAL(giveup_altivec) - mfmsr r5 - oris r5,r5,MSR_VEC@h - mtmsrd r5 /* enable use of VMX now */ - isync - cmpdi 0,r3,0 - beqlr- /* if no previous owner, done */ - addi r3,r3,THREAD /* want THREAD of task */ - ld r5,PT_REGS(r3) - cmpdi 0,r5,0 - SAVE_32VRS(0,r4,r3) - mfvscr vr0 - li r4,THREAD_VSCR - stvx vr0,r4,r3 - beq 1f - ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) - lis r3,MSR_VEC@h - andc r4,r4,r3 /* disable FP for previous task */ - std r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#ifndef CONFIG_SMP - li r5,0 - ld r4,last_task_used_altivec@got(r2) - std r5,0(r4) -#endif /* CONFIG_SMP */ - blr - -#endif /* CONFIG_ALTIVEC */ - #ifdef CONFIG_SMP #ifdef CONFIG_PPC_PMAC /* @@ -1984,9 +1892,6 @@ _STATIC(start_here_common) bl .start_kernel -_GLOBAL(__setup_cpu_power3) - blr - _GLOBAL(hmt_init) #ifdef CONFIG_HMT LOADADDR(r5, hmt_thread_data) diff --git a/arch/ppc64/kernel/misc.S b/arch/ppc64/kernel/misc.S index a05b50b..474df0a 100644 --- a/arch/ppc64/kernel/misc.S +++ b/arch/ppc64/kernel/misc.S @@ -680,6 +680,104 @@ _GLOBAL(kernel_thread) ld r30,-16(r1) blr +/* + * disable_kernel_fp() + * Disable the FPU. + */ +_GLOBAL(disable_kernel_fp) + mfmsr r3 + rldicl r0,r3,(63-MSR_FP_LG),1 + rldicl r3,r0,(MSR_FP_LG+1),0 + mtmsrd r3 /* disable use of fpu now */ + isync + blr + +/* + * giveup_fpu(tsk) + * Disable FP for the task given as the argument, + * and save the floating-point registers in its thread_struct. + * Enables the FPU for use in the kernel on return. + */ +_GLOBAL(giveup_fpu) + mfmsr r5 + ori r5,r5,MSR_FP + mtmsrd r5 /* enable use of fpu now */ + isync + cmpdi 0,r3,0 + beqlr- /* if no previous owner, done */ + addi r3,r3,THREAD /* want THREAD of task */ + ld r5,PT_REGS(r3) + cmpdi 0,r5,0 + SAVE_32FPRS(0, r3) + mffs fr0 + stfd fr0,THREAD_FPSCR(r3) + beq 1f + ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) + li r3,MSR_FP|MSR_FE0|MSR_FE1 + andc r4,r4,r3 /* disable FP for previous task */ + std r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#ifndef CONFIG_SMP + li r5,0 + ld r4,last_task_used_math@got(r2) + std r5,0(r4) +#endif /* CONFIG_SMP */ + blr + +#ifdef CONFIG_ALTIVEC + +#if 0 /* this has no callers for now */ +/* + * disable_kernel_altivec() + * Disable the VMX. + */ +_GLOBAL(disable_kernel_altivec) + mfmsr r3 + rldicl r0,r3,(63-MSR_VEC_LG),1 + rldicl r3,r0,(MSR_VEC_LG+1),0 + mtmsrd r3 /* disable use of VMX now */ + isync + blr +#endif /* 0 */ + +/* + * giveup_altivec(tsk) + * Disable VMX for the task given as the argument, + * and save the vector registers in its thread_struct. + * Enables the VMX for use in the kernel on return. + */ +_GLOBAL(giveup_altivec) + mfmsr r5 + oris r5,r5,MSR_VEC@h + mtmsrd r5 /* enable use of VMX now */ + isync + cmpdi 0,r3,0 + beqlr- /* if no previous owner, done */ + addi r3,r3,THREAD /* want THREAD of task */ + ld r5,PT_REGS(r3) + cmpdi 0,r5,0 + SAVE_32VRS(0,r4,r3) + mfvscr vr0 + li r4,THREAD_VSCR + stvx vr0,r4,r3 + beq 1f + ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) + lis r3,MSR_VEC@h + andc r4,r4,r3 /* disable FP for previous task */ + std r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#ifndef CONFIG_SMP + li r5,0 + ld r4,last_task_used_altivec@got(r2) + std r5,0(r4) +#endif /* CONFIG_SMP */ + blr + +#endif /* CONFIG_ALTIVEC */ + +_GLOBAL(__setup_cpu_power3) + blr + /* kexec_wait(phys_cpu) * * wait for the flag to change, indicating this kernel is going away but -- cgit v0.10.2 From 60ba44945714d9b7dae8b85ab0926f6f13809c73 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 19 Aug 2005 14:52:32 +1000 Subject: [PATCH] Fix apparent code overlap in ppc64 head.S An #if/#else construct near the top of ppc64's head.S appears to create overlapping sections of code for iSeries and pSeries (i.e. one thing on iSeries and something different in the same place on pSeries). In fact, checking the various absolute offsets, it doesn't. This patch unravels the #ifdefs to make it more obvious what's going on. This accomplishes another microstep towards a single kernel image which can boot both iSeries and pSeries. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index 70e1040..1cc9f20 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -93,6 +93,7 @@ END_FTR_SECTION(0, 1) /* Catch branch to 0 in real mode */ trap + #ifdef CONFIG_PPC_ISERIES /* * At offset 0x20, there is a pointer to iSeries LPAR data. @@ -119,7 +120,7 @@ embedded_sysmap_start: embedded_sysmap_end: .llong 0 -#else /* CONFIG_PPC_ISERIES */ +#endif /* CONFIG_PPC_ISERIES */ /* Secondary processors spin on this value until it goes to 1. */ .globl __secondary_hold_spinloop @@ -169,7 +170,6 @@ _GLOBAL(__secondary_hold) BUG_OPCODE #endif #endif -#endif /* This value is used to mark exception frames on the stack. */ .section ".toc","aw" -- cgit v0.10.2 From 1d086e6bd605ac44154e019fe96ae3568e8b2ba2 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 19 Aug 2005 14:52:32 +1000 Subject: [PATCH] Remove unneeded #defines in head.S arch/ppc64/kernel/head.S #defines SECONDARY_PROCESSORS then has some #ifdefs based on it. Whatever purpose this had is long lost, this patch removes it. Likewise, head.S defines H_SET_ASR, which is now defined, along with other hypervisor call numbers in hvcall.h. This patch deletes it, as well, from head.S. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index 1cc9f20..bf2345e 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -23,8 +23,6 @@ * 2 of the License, or (at your option) any later version. */ -#define SECONDARY_PROCESSORS - #include #include #include @@ -44,11 +42,6 @@ #endif /* - * hcall interface to pSeries LPAR - */ -#define H_SET_ASR 0x30 - -/* * We layout physical memory as follows: * 0x0000 - 0x00ff : Secondary processor spin code * 0x0100 - 0x2fff : pSeries Interrupt prologs @@ -629,9 +622,7 @@ system_reset_iSeries: cmpwi 0,r23,0 beq iSeries_secondary_smp_loop /* Loop until told to go */ -#ifdef SECONDARY_PROCESSORS bne .__secondary_start /* Loop until told to go */ -#endif iSeries_secondary_smp_loop: /* Let the Hypervisor know we are alive */ /* 8002 is a call to HvCallCfg::getLps, a harmless Hypervisor function */ @@ -1325,10 +1316,8 @@ _GLOBAL(pSeries_secondary_smp_init) cmpwi 0,r23,0 #ifdef CONFIG_SMP -#ifdef SECONDARY_PROCESSORS bne .__secondary_start #endif -#endif b 3b /* Loop until told to go */ #ifdef CONFIG_PPC_ISERIES -- cgit v0.10.2 From 91a57fc6723d778e12686b5106a38583072fd767 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 19 Aug 2005 14:52:32 +1000 Subject: [PATCH] Tweak comments in ppc64 head.S This patch adjust some comments in head.S for accuracy, clarity, and spelling. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index bf2345e..c7462fa 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -148,7 +148,7 @@ _GLOBAL(__secondary_hold) std r24,__secondary_hold_acknowledge@l(0) sync - /* All secondary cpu's wait here until told to start. */ + /* All secondary cpus wait here until told to start. */ 100: ld r4,__secondary_hold_spinloop@l(0) cmpdi 0,r4,1 bne 100b @@ -703,8 +703,8 @@ machine_check_common: * R9 contains the saved CR, r13 points to the paca, * r10 contains the (bad) kernel stack pointer, * r11 and r12 contain the saved SRR0 and SRR1. - * We switch to using the paca guard page as an emergency stack, - * save the registers there, and call kernel_bad_stack(), which panics. + * We switch to using an emergency stack, save the registers there, + * and call kernel_bad_stack(), which panics. */ bad_stack: ld r1,PACAEMERGSP(r13) @@ -1303,7 +1303,7 @@ _GLOBAL(pSeries_secondary_smp_init) b .kexec_wait /* next kernel might do better */ 2: mtspr SPRG3,r13 /* Save vaddr of paca in SPRG3 */ - /* From now on, r24 is expected to be logica cpuid */ + /* From now on, r24 is expected to be logical cpuid */ mr r24,r5 3: HMT_LOW lbz r23,PACAPROCSTART(r13) /* Test if this processor should */ -- cgit v0.10.2 From 6fbb49d56d228b666cb4534bbc3c2dfe833c8053 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 19 Aug 2005 14:52:32 +1000 Subject: [PATCH] Move variables in ppc64 head.S from .data to .bss The ppc64 head.S defines several zero-initialized structures, such as the empty_zero_page and the kernel top-level pagetable. Currently they are defined to be in the data section. However, they're not used until after the bss is cleared, so this patch moves them to the bss, saving two and a half pages from the vmlinux. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index c7462fa..a0ff707 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -1971,20 +1971,19 @@ _GLOBAL(smp_release_cpus) /* * We put a few things here that have to be page-aligned. - * This stuff goes at the beginning of the data segment, - * which is page-aligned. + * This stuff goes at the beginning of the bss, which is page-aligned. */ - .data + .section ".bss" + .align 12 - .globl sdata -sdata: + .globl empty_zero_page empty_zero_page: - .space 4096 + .space PAGE_SIZE .globl swapper_pg_dir swapper_pg_dir: - .space 4096 + .space PAGE_SIZE /* * This space gets a copy of optional info passed to us by the bootstrap -- cgit v0.10.2 From 7a6af5e38054d8e658a4b1b703902331a845de1a Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 3 Aug 2005 14:32:30 +1000 Subject: [PATCH] ppc64: remove firmware features from cpu_spec The firmware_features field of struct cpu_spec should really be a separate variable as the firmware features do not depend on the chip and the bitmask is constructed independently. By removing it, we save 112 bytes from the cpu_specs array and we access the bitmask directly instead of via the cur_cpu_spec pointer. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/cputable.c b/arch/ppc64/kernel/cputable.c index 77cec42..84fdd27 100644 --- a/arch/ppc64/kernel/cputable.c +++ b/arch/ppc64/kernel/cputable.c @@ -23,6 +23,7 @@ struct cpu_spec* cur_cpu_spec = NULL; EXPORT_SYMBOL(cur_cpu_spec); +unsigned long ppc64_firmware_features; /* NOTE: * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's @@ -60,7 +61,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* Power3+ */ .pvr_mask = 0xffff0000, @@ -73,7 +73,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* Northstar */ .pvr_mask = 0xffff0000, @@ -86,7 +85,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* Pulsar */ .pvr_mask = 0xffff0000, @@ -99,7 +97,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* I-star */ .pvr_mask = 0xffff0000, @@ -112,7 +109,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* S-star */ .pvr_mask = 0xffff0000, @@ -125,7 +121,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* Power4 */ .pvr_mask = 0xffff0000, @@ -138,7 +133,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power4, - .firmware_features = COMMON_PPC64_FW, }, { /* Power4+ */ .pvr_mask = 0xffff0000, @@ -151,7 +145,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power4, - .firmware_features = COMMON_PPC64_FW, }, { /* PPC970 */ .pvr_mask = 0xffff0000, @@ -166,7 +159,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_ppc970, - .firmware_features = COMMON_PPC64_FW, }, { /* PPC970FX */ .pvr_mask = 0xffff0000, @@ -181,7 +173,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_ppc970, - .firmware_features = COMMON_PPC64_FW, }, { /* PPC970MP */ .pvr_mask = 0xffff0000, @@ -196,7 +187,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_ppc970, - .firmware_features = COMMON_PPC64_FW, }, { /* Power5 */ .pvr_mask = 0xffff0000, @@ -211,7 +201,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power4, - .firmware_features = COMMON_PPC64_FW, }, { /* Power5 */ .pvr_mask = 0xffff0000, @@ -226,7 +215,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power4, - .firmware_features = COMMON_PPC64_FW, }, { /* BE DD1.x */ .pvr_mask = 0xffff0000, @@ -241,7 +229,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_be, - .firmware_features = COMMON_PPC64_FW, }, { /* default match */ .pvr_mask = 0x00000000, @@ -254,7 +241,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power4, - .firmware_features = COMMON_PPC64_FW, } }; diff --git a/arch/ppc64/kernel/lparcfg.c b/arch/ppc64/kernel/lparcfg.c index 02e9662..9383538 100644 --- a/arch/ppc64/kernel/lparcfg.c +++ b/arch/ppc64/kernel/lparcfg.c @@ -377,7 +377,7 @@ static int lparcfg_data(struct seq_file *m, void *v) partition_active_processors = lparcfg_count_active_processors(); - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + if (ppc64_firmware_features & FW_FEATURE_SPLPAR) { unsigned long h_entitled, h_unallocated; unsigned long h_aggregation, h_resource; unsigned long pool_idle_time, pool_procs; @@ -571,7 +571,7 @@ int __init lparcfg_init(void) mode_t mode = S_IRUSR; /* Allow writing if we have FW_FEATURE_SPLPAR */ - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + if (ppc64_firmware_features & FW_FEATURE_SPLPAR) { lparcfg_fops.write = lparcfg_write; mode |= S_IWUSR; } diff --git a/arch/ppc64/kernel/pSeries_iommu.c b/arch/ppc64/kernel/pSeries_iommu.c index 6913052..a5786be 100644 --- a/arch/ppc64/kernel/pSeries_iommu.c +++ b/arch/ppc64/kernel/pSeries_iommu.c @@ -546,7 +546,7 @@ void iommu_init_early_pSeries(void) } if (systemcfg->platform & PLATFORM_LPAR) { - if (cur_cpu_spec->firmware_features & FW_FEATURE_MULTITCE) { + if (ppc64_firmware_features & FW_FEATURE_MULTITCE) { ppc_md.tce_build = tce_buildmulti_pSeriesLP; ppc_md.tce_free = tce_freemulti_pSeriesLP; } else { diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c index 5bec956..d3975ac 100644 --- a/arch/ppc64/kernel/pSeries_setup.c +++ b/arch/ppc64/kernel/pSeries_setup.c @@ -231,11 +231,11 @@ static void __init pSeries_setup_arch(void) pSeries_nvram_init(); - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) + if (ppc64_firmware_features & FW_FEATURE_SPLPAR) vpa_init(boot_cpuid); /* Choose an idle loop */ - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + if (ppc64_firmware_features & FW_FEATURE_SPLPAR) { if (get_paca()->lppaca.shared_proc) { printk(KERN_INFO "Using shared processor idle loop\n"); ppc_md.idle_loop = pseries_shared_idle; @@ -260,7 +260,7 @@ static int __init pSeries_init_panel(void) arch_initcall(pSeries_init_panel); -/* Build up the firmware_features bitmask field +/* Build up the ppc64_firmware_features bitmask field * using contents of device-tree/ibm,hypertas-functions. * Ultimately this functionality may be moved into prom.c prom_init(). */ @@ -272,7 +272,7 @@ void __init fw_feature_init(void) DBG(" -> fw_feature_init()\n"); - cur_cpu_spec->firmware_features = 0; + ppc64_firmware_features = 0; dn = of_find_node_by_path("/rtas"); if (dn == NULL) { printk(KERN_ERR "WARNING ! Cannot find RTAS in device-tree !\n"); @@ -288,7 +288,7 @@ void __init fw_feature_init(void) if ((firmware_features_table[i].name) && (strcmp(firmware_features_table[i].name,hypertas))==0) { /* we have a match */ - cur_cpu_spec->firmware_features |= + ppc64_firmware_features |= (firmware_features_table[i].val); break; } @@ -302,7 +302,7 @@ void __init fw_feature_init(void) of_node_put(dn); no_rtas: printk(KERN_INFO "firmware_features = 0x%lx\n", - cur_cpu_spec->firmware_features); + ppc64_firmware_features); DBG(" <- fw_feature_init()\n"); } diff --git a/arch/ppc64/kernel/pSeries_smp.c b/arch/ppc64/kernel/pSeries_smp.c index 62c55a1..8312d32 100644 --- a/arch/ppc64/kernel/pSeries_smp.c +++ b/arch/ppc64/kernel/pSeries_smp.c @@ -326,7 +326,7 @@ static void __devinit smp_xics_setup_cpu(int cpu) if (cpu != boot_cpuid) xics_setup_cpu(); - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) + if (ppc64_firmware_features & FW_FEATURE_SPLPAR) vpa_init(cpu); cpu_clear(cpu, of_spin_map); diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c index f7cae05..390f434 100644 --- a/arch/ppc64/kernel/process.c +++ b/arch/ppc64/kernel/process.c @@ -206,7 +206,7 @@ struct task_struct *__switch_to(struct task_struct *prev, /* purr is nothing but processor time base */ #if defined(CONFIG_PPC_PSERIES) - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + if (ppc64_firmware_features & FW_FEATURE_SPLPAR) { struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); long unsigned start_tb, current_tb; start_tb = old_thread->start_tb; diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c index 02b8ac4..90b653c 100644 --- a/arch/ppc64/kernel/sysfs.c +++ b/arch/ppc64/kernel/sysfs.c @@ -154,7 +154,7 @@ void ppc64_enable_pmcs(void) #ifdef CONFIG_PPC_PSERIES /* instruct hypervisor to maintain PMCs */ - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) + if (ppc64_firmware_features & FW_FEATURE_SPLPAR) get_paca()->lppaca.pmcregs_in_use = 1; #endif /* CONFIG_PPC_PSERIES */ } diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c index 909462e..1c05cee 100644 --- a/arch/ppc64/kernel/time.c +++ b/arch/ppc64/kernel/time.c @@ -372,7 +372,7 @@ int timer_interrupt(struct pt_regs * regs) /* collect purr register values often, for accurate calculations */ #if defined(CONFIG_PPC_PSERIES) - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + if (ppc64_firmware_features & FW_FEATURE_SPLPAR) { struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); cu->current_tb = mfspr(SPRN_PURR); } diff --git a/include/asm-ppc64/cputable.h b/include/asm-ppc64/cputable.h index d67fa9e..d55698a 100644 --- a/include/asm-ppc64/cputable.h +++ b/include/asm-ppc64/cputable.h @@ -56,11 +56,6 @@ struct cpu_spec { * BHT, SPD, etc... from head.S before branching to identify_machine */ cpu_setup_t cpu_setup; - - /* This is used to identify firmware features which are available - * to the kernel. - */ - unsigned long firmware_features; }; extern struct cpu_spec cpu_specs[]; @@ -72,6 +67,11 @@ static inline unsigned long cpu_has_feature(unsigned long feature) } +/* This is used to identify firmware features which are available + * to the kernel. + */ +extern unsigned long ppc64_firmware_features; + /* firmware feature bitmask values */ #define FIRMWARE_MAX_FEATURES 63 -- cgit v0.10.2 From 1ababe11480d59d75be806804c71fa55d203a5a6 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 3 Aug 2005 14:35:25 +1000 Subject: [PATCH] ppc64: create firmware_has_feature() Create the firmware_has_feature() inline and move the firmware feature stuff into its own header file. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/cputable.c b/arch/ppc64/kernel/cputable.c index 84fdd27..6294fc7 100644 --- a/arch/ppc64/kernel/cputable.c +++ b/arch/ppc64/kernel/cputable.c @@ -20,6 +20,7 @@ #include #include +#include struct cpu_spec* cur_cpu_spec = NULL; EXPORT_SYMBOL(cur_cpu_spec); diff --git a/arch/ppc64/kernel/lparcfg.c b/arch/ppc64/kernel/lparcfg.c index 9383538..9d034ff 100644 --- a/arch/ppc64/kernel/lparcfg.c +++ b/arch/ppc64/kernel/lparcfg.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include @@ -377,7 +377,7 @@ static int lparcfg_data(struct seq_file *m, void *v) partition_active_processors = lparcfg_count_active_processors(); - if (ppc64_firmware_features & FW_FEATURE_SPLPAR) { + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { unsigned long h_entitled, h_unallocated; unsigned long h_aggregation, h_resource; unsigned long pool_idle_time, pool_procs; @@ -571,7 +571,7 @@ int __init lparcfg_init(void) mode_t mode = S_IRUSR; /* Allow writing if we have FW_FEATURE_SPLPAR */ - if (ppc64_firmware_features & FW_FEATURE_SPLPAR) { + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { lparcfg_fops.write = lparcfg_write; mode |= S_IWUSR; } diff --git a/arch/ppc64/kernel/pSeries_iommu.c b/arch/ppc64/kernel/pSeries_iommu.c index a5786be..9d5e1e7 100644 --- a/arch/ppc64/kernel/pSeries_iommu.c +++ b/arch/ppc64/kernel/pSeries_iommu.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "pci.h" #define DBG(fmt...) @@ -546,7 +547,7 @@ void iommu_init_early_pSeries(void) } if (systemcfg->platform & PLATFORM_LPAR) { - if (ppc64_firmware_features & FW_FEATURE_MULTITCE) { + if (firmware_has_feature(FW_FEATURE_MULTITCE)) { ppc_md.tce_build = tce_buildmulti_pSeriesLP; ppc_md.tce_free = tce_freemulti_pSeriesLP; } else { diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c index d3975ac..0058f32 100644 --- a/arch/ppc64/kernel/pSeries_setup.c +++ b/arch/ppc64/kernel/pSeries_setup.c @@ -60,7 +60,7 @@ #include #include #include -#include +#include #include "i8259.h" #include "mpic.h" @@ -231,11 +231,11 @@ static void __init pSeries_setup_arch(void) pSeries_nvram_init(); - if (ppc64_firmware_features & FW_FEATURE_SPLPAR) + if (firmware_has_feature(FW_FEATURE_SPLPAR)) vpa_init(boot_cpuid); /* Choose an idle loop */ - if (ppc64_firmware_features & FW_FEATURE_SPLPAR) { + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { if (get_paca()->lppaca.shared_proc) { printk(KERN_INFO "Using shared processor idle loop\n"); ppc_md.idle_loop = pseries_shared_idle; diff --git a/arch/ppc64/kernel/pSeries_smp.c b/arch/ppc64/kernel/pSeries_smp.c index 8312d32..79c7f32 100644 --- a/arch/ppc64/kernel/pSeries_smp.c +++ b/arch/ppc64/kernel/pSeries_smp.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -326,7 +327,7 @@ static void __devinit smp_xics_setup_cpu(int cpu) if (cpu != boot_cpuid) xics_setup_cpu(); - if (ppc64_firmware_features & FW_FEATURE_SPLPAR) + if (firmware_has_feature(FW_FEATURE_SPLPAR)) vpa_init(cpu); cpu_clear(cpu, of_spin_map); diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c index 390f434..9bad983 100644 --- a/arch/ppc64/kernel/process.c +++ b/arch/ppc64/kernel/process.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -206,7 +207,7 @@ struct task_struct *__switch_to(struct task_struct *prev, /* purr is nothing but processor time base */ #if defined(CONFIG_PPC_PSERIES) - if (ppc64_firmware_features & FW_FEATURE_SPLPAR) { + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); long unsigned start_tb, current_tb; start_tb = old_thread->start_tb; diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c index 90b653c..e399963 100644 --- a/arch/ppc64/kernel/sysfs.c +++ b/arch/ppc64/kernel/sysfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -154,7 +155,7 @@ void ppc64_enable_pmcs(void) #ifdef CONFIG_PPC_PSERIES /* instruct hypervisor to maintain PMCs */ - if (ppc64_firmware_features & FW_FEATURE_SPLPAR) + if (firmware_has_feature(FW_FEATURE_SPLPAR)) get_paca()->lppaca.pmcregs_in_use = 1; #endif /* CONFIG_PPC_PSERIES */ } diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c index 1c05cee..d523056 100644 --- a/arch/ppc64/kernel/time.c +++ b/arch/ppc64/kernel/time.c @@ -67,6 +67,7 @@ #include #include #include +#include u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; @@ -372,7 +373,7 @@ int timer_interrupt(struct pt_regs * regs) /* collect purr register values often, for accurate calculations */ #if defined(CONFIG_PPC_PSERIES) - if (ppc64_firmware_features & FW_FEATURE_SPLPAR) { + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); cu->current_tb = mfspr(SPRN_PURR); } diff --git a/include/asm-ppc64/cputable.h b/include/asm-ppc64/cputable.h index d55698a..ae6cf38 100644 --- a/include/asm-ppc64/cputable.h +++ b/include/asm-ppc64/cputable.h @@ -66,44 +66,6 @@ static inline unsigned long cpu_has_feature(unsigned long feature) return cur_cpu_spec->cpu_features & feature; } - -/* This is used to identify firmware features which are available - * to the kernel. - */ -extern unsigned long ppc64_firmware_features; - -/* firmware feature bitmask values */ -#define FIRMWARE_MAX_FEATURES 63 - -#define FW_FEATURE_PFT (1UL<<0) -#define FW_FEATURE_TCE (1UL<<1) -#define FW_FEATURE_SPRG0 (1UL<<2) -#define FW_FEATURE_DABR (1UL<<3) -#define FW_FEATURE_COPY (1UL<<4) -#define FW_FEATURE_ASR (1UL<<5) -#define FW_FEATURE_DEBUG (1UL<<6) -#define FW_FEATURE_TERM (1UL<<7) -#define FW_FEATURE_PERF (1UL<<8) -#define FW_FEATURE_DUMP (1UL<<9) -#define FW_FEATURE_INTERRUPT (1UL<<10) -#define FW_FEATURE_MIGRATE (1UL<<11) -#define FW_FEATURE_PERFMON (1UL<<12) -#define FW_FEATURE_CRQ (1UL<<13) -#define FW_FEATURE_VIO (1UL<<14) -#define FW_FEATURE_RDMA (1UL<<15) -#define FW_FEATURE_LLAN (1UL<<16) -#define FW_FEATURE_BULK (1UL<<17) -#define FW_FEATURE_XDABR (1UL<<18) -#define FW_FEATURE_MULTITCE (1UL<<19) -#define FW_FEATURE_SPLPAR (1UL<<20) - -typedef struct { - unsigned long val; - char * name; -} firmware_feature_t; - -extern firmware_feature_t firmware_features_table[]; - #endif /* __ASSEMBLY__ */ /* CPU kernel features */ @@ -140,10 +102,8 @@ extern firmware_feature_t firmware_features_table[]; #define CPU_FTR_MMCRA_SIHV ASM_CONST(0x0000080000000000) #define CPU_FTR_CTRL ASM_CONST(0x0000100000000000) -/* Platform firmware features */ -#define FW_FTR_ ASM_CONST(0x0000000000000001) - #ifndef __ASSEMBLY__ + #define COMMON_USER_PPC64 (PPC_FEATURE_32 | PPC_FEATURE_64 | \ PPC_FEATURE_HAS_FPU | PPC_FEATURE_HAS_MMU) @@ -156,10 +116,9 @@ extern firmware_feature_t firmware_features_table[]; #define CPU_FTR_PPCAS_ARCH_V2 (CPU_FTR_PPCAS_ARCH_V2_BASE) #else #define CPU_FTR_PPCAS_ARCH_V2 (CPU_FTR_PPCAS_ARCH_V2_BASE | CPU_FTR_16M_PAGE) -#endif +#endif /* CONFIG_PPC_ISERIES */ -#define COMMON_PPC64_FW (0) -#endif +#endif /* __ASSEMBLY */ #ifdef __ASSEMBLY__ diff --git a/include/asm-ppc64/firmware.h b/include/asm-ppc64/firmware.h new file mode 100644 index 0000000..5bb5bf4 --- /dev/null +++ b/include/asm-ppc64/firmware.h @@ -0,0 +1,67 @@ +/* + * include/asm-ppc64/firmware.h + * + * Extracted from include/asm-ppc64/cputable.h + * + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef __ASM_PPC_FIRMWARE_H +#define __ASM_PPC_FIRMWARE_H + +#ifdef __KERNEL__ + +#ifndef __ASSEMBLY__ + +/* firmware feature bitmask values */ +#define FIRMWARE_MAX_FEATURES 63 + +#define FW_FEATURE_PFT (1UL<<0) +#define FW_FEATURE_TCE (1UL<<1) +#define FW_FEATURE_SPRG0 (1UL<<2) +#define FW_FEATURE_DABR (1UL<<3) +#define FW_FEATURE_COPY (1UL<<4) +#define FW_FEATURE_ASR (1UL<<5) +#define FW_FEATURE_DEBUG (1UL<<6) +#define FW_FEATURE_TERM (1UL<<7) +#define FW_FEATURE_PERF (1UL<<8) +#define FW_FEATURE_DUMP (1UL<<9) +#define FW_FEATURE_INTERRUPT (1UL<<10) +#define FW_FEATURE_MIGRATE (1UL<<11) +#define FW_FEATURE_PERFMON (1UL<<12) +#define FW_FEATURE_CRQ (1UL<<13) +#define FW_FEATURE_VIO (1UL<<14) +#define FW_FEATURE_RDMA (1UL<<15) +#define FW_FEATURE_LLAN (1UL<<16) +#define FW_FEATURE_BULK (1UL<<17) +#define FW_FEATURE_XDABR (1UL<<18) +#define FW_FEATURE_MULTITCE (1UL<<19) +#define FW_FEATURE_SPLPAR (1UL<<20) + +/* This is used to identify firmware features which are available + * to the kernel. + */ +extern unsigned long ppc64_firmware_features; + +static inline unsigned long firmware_has_feature(unsigned long feature) +{ + return ppc64_firmware_features & feature; +} + +typedef struct { + unsigned long val; + char * name; +} firmware_feature_t; + +extern firmware_feature_t firmware_features_table[]; + +#endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ +#endif /* __ASM_PPC_FIRMWARE_H */ -- cgit v0.10.2 From 8d15a3e55f49678b0900dcf5c1cddb322a129325 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 3 Aug 2005 14:40:16 +1000 Subject: [PATCH] ppc64: make firmware_has_feature() stronger Make firmware_has_feature() evaluate at compile time for the non pSeries case and tidy up code where possible. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile index cbf87dc..f4b3bfc 100644 --- a/arch/ppc64/kernel/Makefile +++ b/arch/ppc64/kernel/Makefile @@ -11,7 +11,7 @@ obj-y := setup.o entry.o traps.o irq.o idle.o dma.o \ udbg.o binfmt_elf32.o sys_ppc32.o ioctl32.o \ ptrace32.o signal32.o rtc.o init_task.o \ lmb.o cputable.o cpu_setup_power4.o idle_power4.o \ - iommu.o sysfs.o vdso.o pmc.o + iommu.o sysfs.o vdso.o pmc.o firmware.o obj-y += vdso32/ vdso64/ obj-$(CONFIG_PPC_OF) += of_device.o diff --git a/arch/ppc64/kernel/cputable.c b/arch/ppc64/kernel/cputable.c index 6294fc7..4847f2a 100644 --- a/arch/ppc64/kernel/cputable.c +++ b/arch/ppc64/kernel/cputable.c @@ -5,7 +5,7 @@ * * Modifications for ppc64: * Copyright (C) 2003 Dave Engebretsen - * + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -20,11 +20,9 @@ #include #include -#include struct cpu_spec* cur_cpu_spec = NULL; EXPORT_SYMBOL(cur_cpu_spec); -unsigned long ppc64_firmware_features; /* NOTE: * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's @@ -244,26 +242,3 @@ struct cpu_spec cpu_specs[] = { .cpu_setup = __setup_cpu_power4, } }; - -firmware_feature_t firmware_features_table[FIRMWARE_MAX_FEATURES] = { - {FW_FEATURE_PFT, "hcall-pft"}, - {FW_FEATURE_TCE, "hcall-tce"}, - {FW_FEATURE_SPRG0, "hcall-sprg0"}, - {FW_FEATURE_DABR, "hcall-dabr"}, - {FW_FEATURE_COPY, "hcall-copy"}, - {FW_FEATURE_ASR, "hcall-asr"}, - {FW_FEATURE_DEBUG, "hcall-debug"}, - {FW_FEATURE_PERF, "hcall-perf"}, - {FW_FEATURE_DUMP, "hcall-dump"}, - {FW_FEATURE_INTERRUPT, "hcall-interrupt"}, - {FW_FEATURE_MIGRATE, "hcall-migrate"}, - {FW_FEATURE_PERFMON, "hcall-perfmon"}, - {FW_FEATURE_CRQ, "hcall-crq"}, - {FW_FEATURE_VIO, "hcall-vio"}, - {FW_FEATURE_RDMA, "hcall-rdma"}, - {FW_FEATURE_LLAN, "hcall-lLAN"}, - {FW_FEATURE_BULK, "hcall-bulk"}, - {FW_FEATURE_XDABR, "hcall-xdabr"}, - {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, - {FW_FEATURE_SPLPAR, "hcall-splpar"}, -}; diff --git a/arch/ppc64/kernel/firmware.c b/arch/ppc64/kernel/firmware.c new file mode 100644 index 0000000..d8432c0 --- /dev/null +++ b/arch/ppc64/kernel/firmware.c @@ -0,0 +1,47 @@ +/* + * arch/ppc64/kernel/firmware.c + * + * Extracted from cputable.c + * + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen + * Copyright (C) 2005 Stephen Rothwell, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +#include + +unsigned long ppc64_firmware_features; + +#ifdef CONFIG_PPC_PSERIES +firmware_feature_t firmware_features_table[FIRMWARE_MAX_FEATURES] = { + {FW_FEATURE_PFT, "hcall-pft"}, + {FW_FEATURE_TCE, "hcall-tce"}, + {FW_FEATURE_SPRG0, "hcall-sprg0"}, + {FW_FEATURE_DABR, "hcall-dabr"}, + {FW_FEATURE_COPY, "hcall-copy"}, + {FW_FEATURE_ASR, "hcall-asr"}, + {FW_FEATURE_DEBUG, "hcall-debug"}, + {FW_FEATURE_PERF, "hcall-perf"}, + {FW_FEATURE_DUMP, "hcall-dump"}, + {FW_FEATURE_INTERRUPT, "hcall-interrupt"}, + {FW_FEATURE_MIGRATE, "hcall-migrate"}, + {FW_FEATURE_PERFMON, "hcall-perfmon"}, + {FW_FEATURE_CRQ, "hcall-crq"}, + {FW_FEATURE_VIO, "hcall-vio"}, + {FW_FEATURE_RDMA, "hcall-rdma"}, + {FW_FEATURE_LLAN, "hcall-lLAN"}, + {FW_FEATURE_BULK, "hcall-bulk"}, + {FW_FEATURE_XDABR, "hcall-xdabr"}, + {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, + {FW_FEATURE_SPLPAR, "hcall-splpar"}, +}; +#endif diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c index 0058f32..7ae7a2c 100644 --- a/arch/ppc64/kernel/pSeries_setup.c +++ b/arch/ppc64/kernel/pSeries_setup.c @@ -231,11 +231,9 @@ static void __init pSeries_setup_arch(void) pSeries_nvram_init(); - if (firmware_has_feature(FW_FEATURE_SPLPAR)) - vpa_init(boot_cpuid); - /* Choose an idle loop */ if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + vpa_init(boot_cpuid); if (get_paca()->lppaca.shared_proc) { printk(KERN_INFO "Using shared processor idle loop\n"); ppc_md.idle_loop = pseries_shared_idle; diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c index 9bad983..7a7e027 100644 --- a/arch/ppc64/kernel/process.c +++ b/arch/ppc64/kernel/process.c @@ -203,10 +203,9 @@ struct task_struct *__switch_to(struct task_struct *prev, new_thread = &new->thread; old_thread = ¤t->thread; -/* Collect purr utilization data per process and per processor wise */ -/* purr is nothing but processor time base */ - -#if defined(CONFIG_PPC_PSERIES) + /* Collect purr utilization data per process and per processor + * wise purr is nothing but processor time base + */ if (firmware_has_feature(FW_FEATURE_SPLPAR)) { struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); long unsigned start_tb, current_tb; @@ -215,8 +214,6 @@ struct task_struct *__switch_to(struct task_struct *prev, old_thread->accum_tb += (current_tb - start_tb); new_thread->start_tb = current_tb; } -#endif - local_irq_save(flags); last = _switch(old_thread, new_thread); diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c index e399963..eca15d2 100644 --- a/arch/ppc64/kernel/sysfs.c +++ b/arch/ppc64/kernel/sysfs.c @@ -153,11 +153,9 @@ void ppc64_enable_pmcs(void) break; } -#ifdef CONFIG_PPC_PSERIES /* instruct hypervisor to maintain PMCs */ if (firmware_has_feature(FW_FEATURE_SPLPAR)) get_paca()->lppaca.pmcregs_in_use = 1; -#endif /* CONFIG_PPC_PSERIES */ } #else diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c index d523056..1696e1b 100644 --- a/arch/ppc64/kernel/time.c +++ b/arch/ppc64/kernel/time.c @@ -371,13 +371,11 @@ int timer_interrupt(struct pt_regs * regs) process_hvlpevents(regs); #endif -/* collect purr register values often, for accurate calculations */ -#if defined(CONFIG_PPC_PSERIES) + /* collect purr register values often, for accurate calculations */ if (firmware_has_feature(FW_FEATURE_SPLPAR)) { struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); cu->current_tb = mfspr(SPRN_PURR); } -#endif irq_exit(); diff --git a/include/asm-ppc64/firmware.h b/include/asm-ppc64/firmware.h index 5bb5bf4..e3725f3 100644 --- a/include/asm-ppc64/firmware.h +++ b/include/asm-ppc64/firmware.h @@ -45,6 +45,22 @@ #define FW_FEATURE_MULTITCE (1UL<<19) #define FW_FEATURE_SPLPAR (1UL<<20) +enum { + FW_FEATURE_PSERIES = FW_FEATURE_PFT | FW_FEATURE_TCE | + FW_FEATURE_SPRG0 | FW_FEATURE_DABR | FW_FEATURE_COPY | + FW_FEATURE_ASR | FW_FEATURE_DEBUG | FW_FEATURE_TERM | + FW_FEATURE_PERF | FW_FEATURE_DUMP | FW_FEATURE_INTERRUPT | + FW_FEATURE_MIGRATE | FW_FEATURE_PERFMON | FW_FEATURE_CRQ | + FW_FEATURE_VIO | FW_FEATURE_RDMA | FW_FEATURE_LLAN | + FW_FEATURE_BULK | FW_FEATURE_XDABR | FW_FEATURE_MULTITCE | + FW_FEATURE_SPLPAR, + FW_FEATURE_POSSIBLE = +#ifdef CONFIG_PPC_PSERIES + FW_FEATURE_PSERIES | +#endif + 0, +}; + /* This is used to identify firmware features which are available * to the kernel. */ @@ -52,15 +68,17 @@ extern unsigned long ppc64_firmware_features; static inline unsigned long firmware_has_feature(unsigned long feature) { - return ppc64_firmware_features & feature; + return ppc64_firmware_features & feature & FW_FEATURE_POSSIBLE; } +#ifdef CONFIG_PPC_PSERIES typedef struct { unsigned long val; char * name; } firmware_feature_t; extern firmware_feature_t firmware_features_table[]; +#endif #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ -- cgit v0.10.2 From aed31351941aa990fb0865c186565a589c56d3fe Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 3 Aug 2005 14:43:21 +1000 Subject: [PATCH] ppc64: introduce FW_FEATURE_ISERIES Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/iSeries_setup.c b/arch/ppc64/kernel/iSeries_setup.c index a649edb..460e7df 100644 --- a/arch/ppc64/kernel/iSeries_setup.c +++ b/arch/ppc64/kernel/iSeries_setup.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include "iSeries_setup.h" @@ -314,6 +315,8 @@ static void __init iSeries_init_early(void) DBG(" -> iSeries_init_early()\n"); + ppc64_firmware_features = FW_FEATURE_ISERIES; + ppcdbg_initialize(); #if defined(CONFIG_BLK_DEV_INITRD) diff --git a/arch/ppc64/kernel/pSeries_lpar.c b/arch/ppc64/kernel/pSeries_lpar.c index 74dd144..5684554 100644 --- a/arch/ppc64/kernel/pSeries_lpar.c +++ b/arch/ppc64/kernel/pSeries_lpar.c @@ -52,7 +52,6 @@ EXPORT_SYMBOL(plpar_hcall_4out); EXPORT_SYMBOL(plpar_hcall_norets); EXPORT_SYMBOL(plpar_hcall_8arg_2ret); -extern void fw_feature_init(void); extern void pSeries_find_serial_port(void); diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c index 7ae7a2c..54e0651 100644 --- a/arch/ppc64/kernel/pSeries_setup.c +++ b/arch/ppc64/kernel/pSeries_setup.c @@ -262,7 +262,7 @@ arch_initcall(pSeries_init_panel); * using contents of device-tree/ibm,hypertas-functions. * Ultimately this functionality may be moved into prom.c prom_init(). */ -void __init fw_feature_init(void) +static void __init fw_feature_init(void) { struct device_node * dn; char * hypertas; diff --git a/include/asm-ppc64/firmware.h b/include/asm-ppc64/firmware.h index e3725f3..22bb85c 100644 --- a/include/asm-ppc64/firmware.h +++ b/include/asm-ppc64/firmware.h @@ -44,9 +44,10 @@ #define FW_FEATURE_XDABR (1UL<<18) #define FW_FEATURE_MULTITCE (1UL<<19) #define FW_FEATURE_SPLPAR (1UL<<20) +#define FW_FEATURE_ISERIES (1UL<<21) enum { - FW_FEATURE_PSERIES = FW_FEATURE_PFT | FW_FEATURE_TCE | + FW_FEATURE_PSERIES_POSSIBLE = FW_FEATURE_PFT | FW_FEATURE_TCE | FW_FEATURE_SPRG0 | FW_FEATURE_DABR | FW_FEATURE_COPY | FW_FEATURE_ASR | FW_FEATURE_DEBUG | FW_FEATURE_TERM | FW_FEATURE_PERF | FW_FEATURE_DUMP | FW_FEATURE_INTERRUPT | @@ -54,11 +55,25 @@ enum { FW_FEATURE_VIO | FW_FEATURE_RDMA | FW_FEATURE_LLAN | FW_FEATURE_BULK | FW_FEATURE_XDABR | FW_FEATURE_MULTITCE | FW_FEATURE_SPLPAR, + FW_FEATURE_PSERIES_ALWAYS = 0, + FW_FEATURE_ISERIES_POSSIBLE = FW_FEATURE_ISERIES, + FW_FEATURE_ISERIES_ALWAYS = FW_FEATURE_ISERIES, FW_FEATURE_POSSIBLE = #ifdef CONFIG_PPC_PSERIES - FW_FEATURE_PSERIES | + FW_FEATURE_PSERIES_POSSIBLE | +#endif +#ifdef CONFIG_PPC_ISERIES + FW_FEATURE_ISERIES_POSSIBLE | #endif 0, + FW_FEATURE_ALWAYS = +#ifdef CONFIG_PPC_PSERIES + FW_FEATURE_PSERIES_ALWAYS & +#endif +#ifdef CONFIG_PPC_ISERIES + FW_FEATURE_ISERIES_ALWAYS & +#endif + FW_FEATURE_POSSIBLE, }; /* This is used to identify firmware features which are available @@ -68,7 +83,8 @@ extern unsigned long ppc64_firmware_features; static inline unsigned long firmware_has_feature(unsigned long feature) { - return ppc64_firmware_features & feature & FW_FEATURE_POSSIBLE; + return (FW_FEATURE_ALWAYS & feature) || + (FW_FEATURE_POSSIBLE & ppc64_firmware_features & feature); } #ifdef CONFIG_PPC_PSERIES -- cgit v0.10.2 From 38e85dc18036804ada8698951cfad4e6114fec1b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 3 Aug 2005 20:21:23 +1000 Subject: [PATCH] ppc64: Remove PTRRELOC() from msChunks code The msChunks code was written to work on pSeries, but now it's only used on iSeries. This means there's no need to do PTRRELOC anymore, so remove it all. A few places were getting "extern reloc_offset()" from abs_addr.h, move it into system.h instead. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/LparData.c b/arch/ppc64/kernel/LparData.c index 3b9a260..0ed77b2 100644 --- a/arch/ppc64/kernel/LparData.c +++ b/arch/ppc64/kernel/LparData.c @@ -229,24 +229,16 @@ struct ItVpdAreas itVpdAreas = { struct msChunks msChunks; EXPORT_SYMBOL(msChunks); -/* Depending on whether this is called from iSeries or pSeries setup - * code, the location of the msChunks struct may or may not have - * to be reloc'd, so we force the caller to do that for us by passing - * in a pointer to the structure. - */ unsigned long msChunks_alloc(unsigned long mem, unsigned long num_chunks, unsigned long chunk_size) { - unsigned long offset = reloc_offset(); - struct msChunks *_msChunks = PTRRELOC(&msChunks); - _msChunks->num_chunks = num_chunks; _msChunks->chunk_size = chunk_size; _msChunks->chunk_shift = __ilog2(chunk_size); _msChunks->chunk_mask = (1UL<<_msChunks->chunk_shift)-1; mem = _ALIGN(mem, sizeof(msChunks_entry)); - _msChunks->abs = (msChunks_entry *)(mem + offset); + _msChunks->abs = (msChunks_entry *)mem; mem += num_chunks * sizeof(msChunks_entry); return mem; diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h index 6d4e8e7..93dc63e 100644 --- a/include/asm-ppc64/abs_addr.h +++ b/include/asm-ppc64/abs_addr.h @@ -29,46 +29,30 @@ struct msChunks { extern struct msChunks msChunks; extern unsigned long msChunks_alloc(unsigned long, unsigned long, unsigned long); -extern unsigned long reloc_offset(void); #ifdef CONFIG_MSCHUNKS -static inline unsigned long -chunk_to_addr(unsigned long chunk) +static inline unsigned long chunk_to_addr(unsigned long chunk) { - unsigned long offset = reloc_offset(); - struct msChunks *_msChunks = PTRRELOC(&msChunks); - - return chunk << _msChunks->chunk_shift; + return chunk << msChunks.chunk_shift; } -static inline unsigned long -addr_to_chunk(unsigned long addr) +static inline unsigned long addr_to_chunk(unsigned long addr) { - unsigned long offset = reloc_offset(); - struct msChunks *_msChunks = PTRRELOC(&msChunks); - - return addr >> _msChunks->chunk_shift; + return addr >> msChunks.chunk_shift; } -static inline unsigned long -chunk_offset(unsigned long addr) +static inline unsigned long chunk_offset(unsigned long addr) { - unsigned long offset = reloc_offset(); - struct msChunks *_msChunks = PTRRELOC(&msChunks); - - return addr & _msChunks->chunk_mask; + return addr & msChunks.chunk_mask; } -static inline unsigned long -abs_chunk(unsigned long pchunk) +static inline unsigned long abs_chunk(unsigned long pchunk) { - unsigned long offset = reloc_offset(); - struct msChunks *_msChunks = PTRRELOC(&msChunks); - if ( pchunk >= _msChunks->num_chunks ) { + if (pchunk >= msChunks.num_chunks) return pchunk; - } - return PTRRELOC(_msChunks->abs)[pchunk]; + + return msChunks.abs[pchunk]; } /* A macro so it can take pointers or unsigned long. */ diff --git a/include/asm-ppc64/system.h b/include/asm-ppc64/system.h index 98d120c..4104a5d 100644 --- a/include/asm-ppc64/system.h +++ b/include/asm-ppc64/system.h @@ -302,5 +302,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) #define arch_align_stack(x) (x) +extern unsigned long reloc_offset(void); + #endif /* __KERNEL__ */ #endif -- cgit v0.10.2 From 34c8f6961fc601294a38c5bd5ca12131b2e52674 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 3 Aug 2005 20:21:23 +1000 Subject: [PATCH] ppc64: msChunks cleanups Chunks are 256KB, so use constants for the size/shift/mask, rather than getting them from the msChunks struct. The iSeries debugger (??) might still need access to the values in the msChunks struct, so we keep them around for now, but set them from the constant values. Replace msChunks_entry typedef with regular u32. Simplify msChunks_alloc() to manipulate klimit directly, rather than via a parameter. Move msChunks_alloc() and msChunks into iSeries_setup.c, as that's where they're used. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/LparData.c b/arch/ppc64/kernel/LparData.c index 0ed77b2..0a9c23c 100644 --- a/arch/ppc64/kernel/LparData.c +++ b/arch/ppc64/kernel/LparData.c @@ -225,21 +225,3 @@ struct ItVpdAreas itVpdAreas = { 0,0 } }; - -struct msChunks msChunks; -EXPORT_SYMBOL(msChunks); - -unsigned long -msChunks_alloc(unsigned long mem, unsigned long num_chunks, unsigned long chunk_size) -{ - _msChunks->num_chunks = num_chunks; - _msChunks->chunk_size = chunk_size; - _msChunks->chunk_shift = __ilog2(chunk_size); - _msChunks->chunk_mask = (1UL<<_msChunks->chunk_shift)-1; - - mem = _ALIGN(mem, sizeof(msChunks_entry)); - _msChunks->abs = (msChunks_entry *)mem; - mem += num_chunks * sizeof(msChunks_entry); - - return mem; -} diff --git a/arch/ppc64/kernel/iSeries_setup.c b/arch/ppc64/kernel/iSeries_setup.c index 460e7df..e47984b 100644 --- a/arch/ppc64/kernel/iSeries_setup.c +++ b/arch/ppc64/kernel/iSeries_setup.c @@ -415,6 +415,22 @@ static void __init iSeries_init_early(void) DBG(" <- iSeries_init_early()\n"); } +struct msChunks msChunks = { + /* XXX We don't use these, but Piranha might need them. */ + .chunk_size = MSCHUNKS_CHUNK_SIZE, + .chunk_shift = MSCHUNKS_CHUNK_SHIFT, + .chunk_mask = MSCHUNKS_OFFSET_MASK, +}; +EXPORT_SYMBOL(msChunks); + +void msChunks_alloc(unsigned long num_chunks) +{ + klimit = _ALIGN(klimit, sizeof(u32)); + msChunks.abs = (u32 *)klimit; + klimit += num_chunks * sizeof(u32); + msChunks.num_chunks = num_chunks; +} + /* * The iSeries may have very large memories ( > 128 GB ) and a partition * may get memory in "chunks" that may be anywhere in the 2**52 real @@ -452,7 +468,7 @@ static void __init build_iSeries_Memory_Map(void) /* Chunk size on iSeries is 256K bytes */ totalChunks = (u32)HvLpConfig_getMsChunks(); - klimit = msChunks_alloc(klimit, totalChunks, 1UL << 18); + msChunks_alloc(totalChunks); /* * Get absolute address of our load area @@ -498,7 +514,7 @@ static void __init build_iSeries_Memory_Map(void) */ hptFirstChunk = (u32)addr_to_chunk(HvCallHpt_getHptAddress()); hptSizePages = (u32)HvCallHpt_getHptPages(); - hptSizeChunks = hptSizePages >> (msChunks.chunk_shift - PAGE_SHIFT); + hptSizeChunks = hptSizePages >> (MSCHUNKS_CHUNK_SHIFT - PAGE_SHIFT); hptLastChunk = hptFirstChunk + hptSizeChunks - 1; printk("HPT absolute addr = %016lx, size = %dK\n", diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h index 93dc63e..2276567 100644 --- a/include/asm-ppc64/abs_addr.h +++ b/include/asm-ppc64/abs_addr.h @@ -17,34 +17,37 @@ #include #include -typedef u32 msChunks_entry; struct msChunks { unsigned long num_chunks; unsigned long chunk_size; unsigned long chunk_shift; unsigned long chunk_mask; - msChunks_entry *abs; + u32 *abs; }; extern struct msChunks msChunks; -extern unsigned long msChunks_alloc(unsigned long, unsigned long, unsigned long); #ifdef CONFIG_MSCHUNKS +/* Chunks are 256 KB */ +#define MSCHUNKS_CHUNK_SHIFT (18) +#define MSCHUNKS_CHUNK_SIZE (1UL << MSCHUNKS_CHUNK_SHIFT) +#define MSCHUNKS_OFFSET_MASK (MSCHUNKS_CHUNK_SIZE - 1) + static inline unsigned long chunk_to_addr(unsigned long chunk) { - return chunk << msChunks.chunk_shift; + return chunk << MSCHUNKS_CHUNK_SHIFT; } static inline unsigned long addr_to_chunk(unsigned long addr) { - return addr >> msChunks.chunk_shift; + return addr >> MSCHUNKS_CHUNK_SHIFT; } static inline unsigned long chunk_offset(unsigned long addr) { - return addr & msChunks.chunk_mask; + return addr & MSCHUNKS_OFFSET_MASK; } static inline unsigned long abs_chunk(unsigned long pchunk) -- cgit v0.10.2 From 56e97b71bf55edb69dc8e9715553972ce50b1564 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 3 Aug 2005 20:21:23 +1000 Subject: [PATCH] ppc64: Rename msChunks structure Rename the msChunks struct to get rid of the StUdlY caps and make it a bit clearer what it's for. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index a0ff707..cccec49 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -96,12 +96,12 @@ END_FTR_SECTION(0, 1) .llong hvReleaseData-KERNELBASE /* - * At offset 0x28 and 0x30 are offsets to the msChunks + * At offset 0x28 and 0x30 are offsets to the mschunks_map * array (used by the iSeries LPAR debugger to do translation * between physical addresses and absolute addresses) and * to the pidhash table (also used by the debugger) */ - .llong msChunks-KERNELBASE + .llong mschunks_map-KERNELBASE .llong 0 /* pidhash-KERNELBASE SFRXXX */ /* Offset 0x38 - Pointer to start of embedded System.map */ diff --git a/arch/ppc64/kernel/iSeries_setup.c b/arch/ppc64/kernel/iSeries_setup.c index e47984b..b384a6a 100644 --- a/arch/ppc64/kernel/iSeries_setup.c +++ b/arch/ppc64/kernel/iSeries_setup.c @@ -415,20 +415,20 @@ static void __init iSeries_init_early(void) DBG(" <- iSeries_init_early()\n"); } -struct msChunks msChunks = { +struct mschunks_map mschunks_map = { /* XXX We don't use these, but Piranha might need them. */ .chunk_size = MSCHUNKS_CHUNK_SIZE, .chunk_shift = MSCHUNKS_CHUNK_SHIFT, .chunk_mask = MSCHUNKS_OFFSET_MASK, }; -EXPORT_SYMBOL(msChunks); +EXPORT_SYMBOL(mschunks_map); -void msChunks_alloc(unsigned long num_chunks) +void mschunks_alloc(unsigned long num_chunks) { klimit = _ALIGN(klimit, sizeof(u32)); - msChunks.abs = (u32 *)klimit; + mschunks_map.mapping = (u32 *)klimit; klimit += num_chunks * sizeof(u32); - msChunks.num_chunks = num_chunks; + mschunks_map.num_chunks = num_chunks; } /* @@ -468,7 +468,7 @@ static void __init build_iSeries_Memory_Map(void) /* Chunk size on iSeries is 256K bytes */ totalChunks = (u32)HvLpConfig_getMsChunks(); - msChunks_alloc(totalChunks); + mschunks_alloc(totalChunks); /* * Get absolute address of our load area @@ -505,7 +505,7 @@ static void __init build_iSeries_Memory_Map(void) printk("Load area size %dK\n", loadAreaSize * 256); for (nextPhysChunk = 0; nextPhysChunk < loadAreaSize; ++nextPhysChunk) - msChunks.abs[nextPhysChunk] = + mschunks_map.mapping[nextPhysChunk] = loadAreaFirstChunk + nextPhysChunk; /* @@ -571,7 +571,8 @@ static void __init build_iSeries_Memory_Map(void) (absChunk > hptLastChunk)) && ((absChunk < loadAreaFirstChunk) || (absChunk > loadAreaLastChunk))) { - msChunks.abs[nextPhysChunk] = absChunk; + mschunks_map.mapping[nextPhysChunk] = + absChunk; ++nextPhysChunk; } } diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h index 2276567..05414a9 100644 --- a/include/asm-ppc64/abs_addr.h +++ b/include/asm-ppc64/abs_addr.h @@ -17,18 +17,17 @@ #include #include -struct msChunks { +#ifdef CONFIG_MSCHUNKS + +struct mschunks_map { unsigned long num_chunks; unsigned long chunk_size; unsigned long chunk_shift; unsigned long chunk_mask; - u32 *abs; + u32 *mapping; }; -extern struct msChunks msChunks; - - -#ifdef CONFIG_MSCHUNKS +extern struct mschunks_map mschunks_map; /* Chunks are 256 KB */ #define MSCHUNKS_CHUNK_SHIFT (18) @@ -52,10 +51,10 @@ static inline unsigned long chunk_offset(unsigned long addr) static inline unsigned long abs_chunk(unsigned long pchunk) { - if (pchunk >= msChunks.num_chunks) + if (pchunk >= mschunks_map.num_chunks) return pchunk; - return msChunks.abs[pchunk]; + return mschunks_map.mapping[pchunk]; } /* A macro so it can take pointers or unsigned long. */ -- cgit v0.10.2 From ce21795275ab469b97384faa36462350af17eca0 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 3 Aug 2005 20:21:23 +1000 Subject: [PATCH] ppc64: Consolidate some macros The only caller of chunk_offset() and abs_chunk() is phys_to_abs(), so fold the former two into the latter. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h index 05414a9..0255277 100644 --- a/include/asm-ppc64/abs_addr.h +++ b/include/asm-ppc64/abs_addr.h @@ -44,24 +44,17 @@ static inline unsigned long addr_to_chunk(unsigned long addr) return addr >> MSCHUNKS_CHUNK_SHIFT; } -static inline unsigned long chunk_offset(unsigned long addr) +static inline unsigned long phys_to_abs(unsigned long pa) { - return addr & MSCHUNKS_OFFSET_MASK; -} + unsigned long chunk; -static inline unsigned long abs_chunk(unsigned long pchunk) -{ - if (pchunk >= mschunks_map.num_chunks) - return pchunk; + chunk = addr_to_chunk(pa); - return mschunks_map.mapping[pchunk]; -} + if (chunk < mschunks_map.num_chunks) + chunk = mschunks_map.mapping[chunk]; -/* A macro so it can take pointers or unsigned long. */ -#define phys_to_abs(pa) \ - ({ unsigned long _pa = (unsigned long)(pa); \ - chunk_to_addr(abs_chunk(addr_to_chunk(_pa))) + chunk_offset(_pa); \ - }) + return chunk_to_addr(chunk) + (pa & MSCHUNKS_OFFSET_MASK); +} static inline unsigned long physRpn_to_absRpn(unsigned long rpn) -- cgit v0.10.2 From aefd16b0c5a594b5feaba23954ad74061f45c8a5 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 3 Aug 2005 20:21:24 +1000 Subject: [PATCH] ppc64: Remove redundant uses of physRpn_to_absRpn physRpn_to_absRpn is a no-op on non-iSeries platforms, remove the two redundant calls. There's only one caller on iSeries so fold the logic in there so we can get rid of it completely. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/iSeries_htab.c b/arch/ppc64/kernel/iSeries_htab.c index b0250ae..2192055 100644 --- a/arch/ppc64/kernel/iSeries_htab.c +++ b/arch/ppc64/kernel/iSeries_htab.c @@ -41,6 +41,7 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va, unsigned long prpn, unsigned long vflags, unsigned long rflags) { + unsigned long arpn; long slot; hpte_t lhpte; int secondary = 0; @@ -70,8 +71,10 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va, slot &= 0x7fffffffffffffff; } + arpn = phys_to_abs(prpn << PAGE_SHIFT) >> PAGE_SHIFT; + lhpte.v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID; - lhpte.r = (physRpn_to_absRpn(prpn) << HPTE_R_RPN_SHIFT) | rflags; + lhpte.r = (arpn << HPTE_R_RPN_SHIFT) | rflags; /* Now fill in the actual HPTE */ HvCallHpt_addValidate(slot, secondary, &lhpte); diff --git a/arch/ppc64/kernel/pSeries_lpar.c b/arch/ppc64/kernel/pSeries_lpar.c index 5684554..0a3ddc9 100644 --- a/arch/ppc64/kernel/pSeries_lpar.c +++ b/arch/ppc64/kernel/pSeries_lpar.c @@ -278,7 +278,6 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long va, unsigned long prpn, unsigned long vflags, unsigned long rflags) { - unsigned long arpn = physRpn_to_absRpn(prpn); unsigned long lpar_rc; unsigned long flags; unsigned long slot; @@ -289,7 +288,7 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group, if (vflags & HPTE_V_LARGE) hpte_v &= ~(1UL << HPTE_V_AVPN_SHIFT); - hpte_r = (arpn << HPTE_R_RPN_SHIFT) | rflags; + hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags; /* Now fill in the actual HPTE */ /* Set CEC cookie to 0 */ diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c index a6abd3a..7626bb5 100644 --- a/arch/ppc64/mm/hash_native.c +++ b/arch/ppc64/mm/hash_native.c @@ -51,7 +51,6 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va, unsigned long prpn, unsigned long vflags, unsigned long rflags) { - unsigned long arpn = physRpn_to_absRpn(prpn); hpte_t *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; int i; @@ -74,7 +73,7 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va, hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID; if (vflags & HPTE_V_LARGE) va &= ~(1UL << HPTE_V_AVPN_SHIFT); - hpte_r = (arpn << HPTE_R_RPN_SHIFT) | rflags; + hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags; hptep->r = hpte_r; /* Guarantee the second dword is visible before the valid bit */ diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h index 0255277..ab4320c 100644 --- a/include/asm-ppc64/abs_addr.h +++ b/include/asm-ppc64/abs_addr.h @@ -56,14 +56,6 @@ static inline unsigned long phys_to_abs(unsigned long pa) return chunk_to_addr(chunk) + (pa & MSCHUNKS_OFFSET_MASK); } -static inline unsigned long -physRpn_to_absRpn(unsigned long rpn) -{ - unsigned long pa = rpn << PAGE_SHIFT; - unsigned long aa = phys_to_abs(pa); - return (aa >> PAGE_SHIFT); -} - /* A macro so it can take pointers or unsigned long. */ #define abs_to_phys(aa) lmb_abs_to_phys((unsigned long)(aa)) -- cgit v0.10.2 From a4a0f97020444f83bf22bb9c8c20d8af2b4e6317 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 3 Aug 2005 20:21:24 +1000 Subject: [PATCH] ppc64: Remove redundant use of pointers in lmb code The lmb code is all written to use a pointer to an lmb struct. But it's always the same lmb struct, called "lmb". So we take the address of lmb, call it _lmb and then start using _lmb->foo everywhere, which is silly. This patch removes the _lmb pointers and replaces them with direct references to the one "lmb" struct. We do the same for some _mem and _rsv pointers which point to lmb.memory and lmb.reserved respectively. This patch looks quite busy, but it's basically just: s/_lmb->/lmb./g s/_mem->/lmb.memory./g s/_rsv->/lmb.reserved./g s/_rsv/&lmb.reserved/g s/mem->/lmb.memory./g Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/lmb.c b/arch/ppc64/kernel/lmb.c index d6c6bd0..6cb2756 100644 --- a/arch/ppc64/kernel/lmb.c +++ b/arch/ppc64/kernel/lmb.c @@ -28,33 +28,32 @@ void lmb_dump_all(void) { #ifdef DEBUG unsigned long i; - struct lmb *_lmb = &lmb; udbg_printf("lmb_dump_all:\n"); udbg_printf(" memory.cnt = 0x%lx\n", - _lmb->memory.cnt); + lmb.memory.cnt); udbg_printf(" memory.size = 0x%lx\n", - _lmb->memory.size); - for (i=0; i < _lmb->memory.cnt ;i++) { + lmb.memory.size); + for (i=0; i < lmb.memory.cnt ;i++) { udbg_printf(" memory.region[0x%x].base = 0x%lx\n", - i, _lmb->memory.region[i].base); + i, lmb.memory.region[i].base); udbg_printf(" .physbase = 0x%lx\n", - _lmb->memory.region[i].physbase); + lmb.memory.region[i].physbase); udbg_printf(" .size = 0x%lx\n", - _lmb->memory.region[i].size); + lmb.memory.region[i].size); } udbg_printf("\n reserved.cnt = 0x%lx\n", - _lmb->reserved.cnt); + lmb.reserved.cnt); udbg_printf(" reserved.size = 0x%lx\n", - _lmb->reserved.size); - for (i=0; i < _lmb->reserved.cnt ;i++) { + lmb.reserved.size); + for (i=0; i < lmb.reserved.cnt ;i++) { udbg_printf(" reserved.region[0x%x].base = 0x%lx\n", - i, _lmb->reserved.region[i].base); + i, lmb.reserved.region[i].base); udbg_printf(" .physbase = 0x%lx\n", - _lmb->reserved.region[i].physbase); + lmb.reserved.region[i].physbase); udbg_printf(" .size = 0x%lx\n", - _lmb->reserved.region[i].size); + lmb.reserved.region[i].size); } #endif /* DEBUG */ } @@ -108,19 +107,17 @@ lmb_coalesce_regions(struct lmb_region *rgn, unsigned long r1, unsigned long r2) void __init lmb_init(void) { - struct lmb *_lmb = &lmb; - /* Create a dummy zero size LMB which will get coalesced away later. * This simplifies the lmb_add() code below... */ - _lmb->memory.region[0].base = 0; - _lmb->memory.region[0].size = 0; - _lmb->memory.cnt = 1; + lmb.memory.region[0].base = 0; + lmb.memory.region[0].size = 0; + lmb.memory.cnt = 1; /* Ditto. */ - _lmb->reserved.region[0].base = 0; - _lmb->reserved.region[0].size = 0; - _lmb->reserved.cnt = 1; + lmb.reserved.region[0].base = 0; + lmb.reserved.region[0].size = 0; + lmb.reserved.cnt = 1; } /* This routine called with relocation disabled. */ @@ -130,27 +127,26 @@ lmb_analyze(void) unsigned long i; unsigned long mem_size = 0; unsigned long size_mask = 0; - struct lmb *_lmb = &lmb; #ifdef CONFIG_MSCHUNKS unsigned long physbase = 0; #endif - for (i=0; i < _lmb->memory.cnt; i++) { + for (i=0; i < lmb.memory.cnt; i++) { unsigned long lmb_size; - lmb_size = _lmb->memory.region[i].size; + lmb_size = lmb.memory.region[i].size; #ifdef CONFIG_MSCHUNKS - _lmb->memory.region[i].physbase = physbase; + lmb.memory.region[i].physbase = physbase; physbase += lmb_size; #else - _lmb->memory.region[i].physbase = _lmb->memory.region[i].base; + lmb.memory.region[i].physbase = lmb.memory.region[i].base; #endif mem_size += lmb_size; size_mask |= lmb_size; } - _lmb->memory.size = mem_size; + lmb.memory.size = mem_size; } /* This routine called with relocation disabled. */ @@ -213,12 +209,11 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size) long __init lmb_add(unsigned long base, unsigned long size) { - struct lmb *_lmb = &lmb; - struct lmb_region *_rgn = &(_lmb->memory); + struct lmb_region *_rgn = &(lmb.memory); /* On pSeries LPAR systems, the first LMB is our RMO region. */ if ( base == 0 ) - _lmb->rmo_size = size; + lmb.rmo_size = size; return lmb_add_region(_rgn, base, size); @@ -227,8 +222,7 @@ lmb_add(unsigned long base, unsigned long size) long __init lmb_reserve(unsigned long base, unsigned long size) { - struct lmb *_lmb = &lmb; - struct lmb_region *_rgn = &(_lmb->reserved); + struct lmb_region *_rgn = &(lmb.reserved); return lmb_add_region(_rgn, base, size); } @@ -260,13 +254,10 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr) { long i, j; unsigned long base = 0; - struct lmb *_lmb = &lmb; - struct lmb_region *_mem = &(_lmb->memory); - struct lmb_region *_rsv = &(_lmb->reserved); - for (i=_mem->cnt-1; i >= 0; i--) { - unsigned long lmbbase = _mem->region[i].base; - unsigned long lmbsize = _mem->region[i].size; + for (i=lmb.memory.cnt-1; i >= 0; i--) { + unsigned long lmbbase = lmb.memory.region[i].base; + unsigned long lmbsize = lmb.memory.region[i].size; if ( max_addr == LMB_ALLOC_ANYWHERE ) base = _ALIGN_DOWN(lmbbase+lmbsize-size, align); @@ -276,8 +267,8 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr) continue; while ( (lmbbase <= base) && - ((j = lmb_overlaps_region(_rsv,base,size)) >= 0) ) { - base = _ALIGN_DOWN(_rsv->region[j].base-size, align); + ((j = lmb_overlaps_region(&lmb.reserved,base,size)) >= 0) ) { + base = _ALIGN_DOWN(lmb.reserved.region[j].base-size, align); } if ( (base != 0) && (lmbbase <= base) ) @@ -287,7 +278,7 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr) if ( i < 0 ) return 0; - lmb_add_region(_rsv, base, size); + lmb_add_region(&lmb.reserved, base, size); return base; } @@ -295,17 +286,15 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr) unsigned long __init lmb_phys_mem_size(void) { - struct lmb *_lmb = &lmb; #ifdef CONFIG_MSCHUNKS - return _lmb->memory.size; + return lmb.memory.size; #else - struct lmb_region *_mem = &(_lmb->memory); unsigned long total = 0; int i; /* add all physical memory to the bootmem map */ - for (i=0; i < _mem->cnt; i++) - total += _mem->region[i].size; + for (i=0; i < lmb.memory.cnt; i++) + total += lmb.memory.region[i].size; return total; #endif /* CONFIG_MSCHUNKS */ } @@ -313,14 +302,12 @@ lmb_phys_mem_size(void) unsigned long __init lmb_end_of_DRAM(void) { - struct lmb *_lmb = &lmb; - struct lmb_region *_mem = &(_lmb->memory); - int idx = _mem->cnt - 1; + int idx = lmb.memory.cnt - 1; #ifdef CONFIG_MSCHUNKS - return (_mem->region[idx].physbase + _mem->region[idx].size); + return (lmb.memory.region[idx].physbase + lmb.memory.region[idx].size); #else - return (_mem->region[idx].base + _mem->region[idx].size); + return (lmb.memory.region[idx].base + lmb.memory.region[idx].size); #endif /* CONFIG_MSCHUNKS */ return 0; @@ -353,20 +340,19 @@ void __init lmb_enforce_memory_limit(void) { extern unsigned long memory_limit; unsigned long i, limit; - struct lmb_region *mem = &(lmb.memory); if (! memory_limit) return; limit = memory_limit; - for (i = 0; i < mem->cnt; i++) { - if (limit > mem->region[i].size) { - limit -= mem->region[i].size; + for (i = 0; i < lmb.memory.cnt; i++) { + if (limit > lmb.memory.region[i].size) { + limit -= lmb.memory.region[i].size; continue; } - mem->region[i].size = limit; - mem->cnt = i + 1; + lmb.memory.region[i].size = limit; + lmb.memory.cnt = i + 1; break; } } -- cgit v0.10.2 From e88bcd1b29f63738b702e57d831758706162347e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 3 Aug 2005 20:21:25 +1000 Subject: [PATCH] ppc64: Remove redundant abs_to_phys() macro abs_to_phys() is a macro that turns out to do nothing, and also has the unfortunate property that it's not the inverse of phys_to_abs() on iSeries. The following is for my benefit as much as everyone else. With CONFIG_MSCHUNKS enabled, the lmb code is changed such that it keeps a physbase variable for each lmb region. This is used to take the possibly discontiguous lmb regions and present them as a contiguous address space beginning from zero. In this context each lmb region's base address is its "absolute" base address, and its physbase is it's "physical" address (from Linux's point of view). The abs_to_phys() macro does the mapping from "absolute" to "physical". Note: This is not related to the iSeries mapping of physical to absolute (ie. Hypervisor) addresses which is maintained with the msChunks structure. And the msChunks structure is not controlled via CONFIG_MSCHUNKS. Once upon a time you could compile for non-iSeries with CONFIG_MSCHUNKS enabled. But these days CONFIG_MSCHUNKS depends on CONFIG_PPC_ISERIES, so for non-iSeries code abs_to_phys() is a no-op. On iSeries we always have one lmb region which spans from 0 to systemcfg->physicalMemorySize (arch/ppc64/kernel/iSeries_setup.c line 383). This region has a base (ie. absolute) address of 0, and a physbase address of 0 (as calculated in lmb_analyze() (arch/ppc64/kernel/lmb.c line 144)). On iSeries, abs_to_phys(aa) is defined as lmb_abs_to_phys(aa), which finds the lmb region containing aa (and there's only one, ie. 0), and then does: return lmb.memory.region[0].physbase + (aa - lmb.memory.region[0].base) physbase == base == 0, so you're left with "return aa". So remove abs_to_phys(), and lmb_abs_to_phys() which is the implementation of abs_to_phys() for iSeries. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/lmb.c b/arch/ppc64/kernel/lmb.c index 6cb2756..111da73 100644 --- a/arch/ppc64/kernel/lmb.c +++ b/arch/ppc64/kernel/lmb.c @@ -313,25 +313,6 @@ lmb_end_of_DRAM(void) return 0; } -unsigned long __init -lmb_abs_to_phys(unsigned long aa) -{ - unsigned long i, pa = aa; - struct lmb *_lmb = &lmb; - struct lmb_region *_mem = &(_lmb->memory); - - for (i=0; i < _mem->cnt; i++) { - unsigned long lmbbase = _mem->region[i].base; - unsigned long lmbsize = _mem->region[i].size; - if ( lmb_addrs_overlap(aa,1,lmbbase,lmbsize) ) { - pa = _mem->region[i].physbase + (aa - lmbbase); - break; - } - } - - return pa; -} - /* * Truncate the lmb list to memory_limit if it's set * You must call lmb_analyze() after this. diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index 9edfe26..a16cf12 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -42,7 +42,6 @@ #include #include -#include #include #include #include @@ -167,7 +166,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) ptep = pte_alloc_kernel(&init_mm, pmdp, ea); if (!ptep) return -ENOMEM; - pa = abs_to_phys(pa); set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); spin_unlock(&init_mm.page_table_lock); @@ -547,7 +545,7 @@ void __init do_init_bootmem(void) */ bootmap_pages = bootmem_bootmap_pages(total_pages); - start = abs_to_phys(lmb_alloc(bootmap_pages<> PAGE_SHIFT, total_pages); diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h index ab4320c..200db1c 100644 --- a/include/asm-ppc64/abs_addr.h +++ b/include/asm-ppc64/abs_addr.h @@ -56,9 +56,6 @@ static inline unsigned long phys_to_abs(unsigned long pa) return chunk_to_addr(chunk) + (pa & MSCHUNKS_OFFSET_MASK); } -/* A macro so it can take pointers or unsigned long. */ -#define abs_to_phys(aa) lmb_abs_to_phys((unsigned long)(aa)) - #else /* !CONFIG_MSCHUNKS */ #define chunk_to_addr(chunk) ((unsigned long)(chunk)) @@ -68,12 +65,11 @@ static inline unsigned long phys_to_abs(unsigned long pa) #define phys_to_abs(pa) (pa) #define physRpn_to_absRpn(rpn) (rpn) -#define abs_to_phys(aa) (aa) #endif /* !CONFIG_MSCHUNKS */ /* Convenience macros */ #define virt_to_abs(va) phys_to_abs(__pa(va)) -#define abs_to_virt(aa) __va(abs_to_phys(aa)) +#define abs_to_virt(aa) __va(aa) #endif /* _ABS_ADDR_H */ -- cgit v0.10.2 From 180379dcefb39e8bd05d562b0685e9084dffcc0a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 3 Aug 2005 20:21:26 +1000 Subject: [PATCH] ppc64: Remove physbase from the lmb_property struct We no longer need the lmb code to know about abs and phys addresses, so remove the physbase variable from the lmb_property struct. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/lmb.c b/arch/ppc64/kernel/lmb.c index 111da73..6ed6312 100644 --- a/arch/ppc64/kernel/lmb.c +++ b/arch/ppc64/kernel/lmb.c @@ -37,8 +37,6 @@ void lmb_dump_all(void) for (i=0; i < lmb.memory.cnt ;i++) { udbg_printf(" memory.region[0x%x].base = 0x%lx\n", i, lmb.memory.region[i].base); - udbg_printf(" .physbase = 0x%lx\n", - lmb.memory.region[i].physbase); udbg_printf(" .size = 0x%lx\n", lmb.memory.region[i].size); } @@ -50,8 +48,6 @@ void lmb_dump_all(void) for (i=0; i < lmb.reserved.cnt ;i++) { udbg_printf(" reserved.region[0x%x].base = 0x%lx\n", i, lmb.reserved.region[i].base); - udbg_printf(" .physbase = 0x%lx\n", - lmb.reserved.region[i].physbase); udbg_printf(" .size = 0x%lx\n", lmb.reserved.region[i].size); } @@ -97,7 +93,6 @@ lmb_coalesce_regions(struct lmb_region *rgn, unsigned long r1, unsigned long r2) rgn->region[r1].size += rgn->region[r2].size; for (i=r2; i < rgn->cnt-1; i++) { rgn->region[i].base = rgn->region[i+1].base; - rgn->region[i].physbase = rgn->region[i+1].physbase; rgn->region[i].size = rgn->region[i+1].size; } rgn->cnt--; @@ -127,21 +122,12 @@ lmb_analyze(void) unsigned long i; unsigned long mem_size = 0; unsigned long size_mask = 0; -#ifdef CONFIG_MSCHUNKS - unsigned long physbase = 0; -#endif for (i=0; i < lmb.memory.cnt; i++) { unsigned long lmb_size; lmb_size = lmb.memory.region[i].size; -#ifdef CONFIG_MSCHUNKS - lmb.memory.region[i].physbase = physbase; - physbase += lmb_size; -#else - lmb.memory.region[i].physbase = lmb.memory.region[i].base; -#endif mem_size += lmb_size; size_mask |= lmb_size; } @@ -164,7 +150,6 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size) adjacent = lmb_addrs_adjacent(base,size,rgnbase,rgnsize); if ( adjacent > 0 ) { rgn->region[i].base -= size; - rgn->region[i].physbase -= size; rgn->region[i].size += size; coalesced++; break; @@ -191,11 +176,9 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size) for (i=rgn->cnt-1; i >= 0; i--) { if (base < rgn->region[i].base) { rgn->region[i+1].base = rgn->region[i].base; - rgn->region[i+1].physbase = rgn->region[i].physbase; rgn->region[i+1].size = rgn->region[i].size; } else { rgn->region[i+1].base = base; - rgn->region[i+1].physbase = lmb_abs_to_phys(base); rgn->region[i+1].size = size; break; } @@ -304,13 +287,7 @@ lmb_end_of_DRAM(void) { int idx = lmb.memory.cnt - 1; -#ifdef CONFIG_MSCHUNKS - return (lmb.memory.region[idx].physbase + lmb.memory.region[idx].size); -#else return (lmb.memory.region[idx].base + lmb.memory.region[idx].size); -#endif /* CONFIG_MSCHUNKS */ - - return 0; } /* diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c index 65d6e85..09475c8 100644 --- a/arch/ppc64/mm/hash_utils.c +++ b/arch/ppc64/mm/hash_utils.c @@ -210,7 +210,7 @@ void __init htab_initialize(void) /* create bolted the linear mapping in the hash table */ for (i=0; i < lmb.memory.cnt; i++) { - base = lmb.memory.region[i].physbase + KERNELBASE; + base = lmb.memory.region[i].base + KERNELBASE; size = lmb.memory.region[i].size; DBG("creating mapping for region: %lx : %lx\n", base, size); diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index a16cf12..c02dc98 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -482,9 +482,9 @@ void __init mm_init_ppc64(void) for (i = 1; i < lmb.memory.cnt; i++) { unsigned long base, prevbase, prevsize; - prevbase = lmb.memory.region[i-1].physbase; + prevbase = lmb.memory.region[i-1].base; prevsize = lmb.memory.region[i-1].size; - base = lmb.memory.region[i].physbase; + base = lmb.memory.region[i].base; if (base > (prevbase + prevsize)) { io_hole_start = prevbase + prevsize; io_hole_size = base - (prevbase + prevsize); @@ -511,11 +511,8 @@ int page_is_ram(unsigned long pfn) for (i=0; i < lmb.memory.cnt; i++) { unsigned long base; -#ifdef CONFIG_MSCHUNKS - base = lmb.memory.region[i].physbase; -#else base = lmb.memory.region[i].base; -#endif + if ((paddr >= base) && (paddr < (base + lmb.memory.region[i].size))) { return 1; @@ -556,25 +553,25 @@ void __init do_init_bootmem(void) * present. */ for (i=0; i < lmb.memory.cnt; i++) { - unsigned long physbase, size; + unsigned long base, size; unsigned long start_pfn, end_pfn; - physbase = lmb.memory.region[i].physbase; + base = lmb.memory.region[i].base; size = lmb.memory.region[i].size; - start_pfn = physbase >> PAGE_SHIFT; + start_pfn = base >> PAGE_SHIFT; end_pfn = start_pfn + (size >> PAGE_SHIFT); memory_present(0, start_pfn, end_pfn); - free_bootmem(physbase, size); + free_bootmem(base, size); } /* reserve the sections we're already using */ for (i=0; i < lmb.reserved.cnt; i++) { - unsigned long physbase = lmb.reserved.region[i].physbase; + unsigned long base = lmb.reserved.region[i].base; unsigned long size = lmb.reserved.region[i].size; - reserve_bootmem(physbase, size); + reserve_bootmem(base, size); } } @@ -613,10 +610,10 @@ static int __init setup_kcore(void) int i; for (i=0; i < lmb.memory.cnt; i++) { - unsigned long physbase, size; + unsigned long base, size; struct kcore_list *kcore_mem; - physbase = lmb.memory.region[i].physbase; + base = lmb.memory.region[i].base; size = lmb.memory.region[i].size; /* GFP_ATOMIC to avoid might_sleep warnings during boot */ @@ -624,7 +621,7 @@ static int __init setup_kcore(void) if (!kcore_mem) panic("mem_init: kmalloc failed\n"); - kclist_add(kcore_mem, __va(physbase), size); + kclist_add(kcore_mem, __va(base), size); } kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c index 0b191f2..c3116f0 100644 --- a/arch/ppc64/mm/numa.c +++ b/arch/ppc64/mm/numa.c @@ -671,7 +671,7 @@ new_range: * Mark reserved regions on this node */ for (i = 0; i < lmb.reserved.cnt; i++) { - unsigned long physbase = lmb.reserved.region[i].physbase; + unsigned long physbase = lmb.reserved.region[i].base; unsigned long size = lmb.reserved.region[i].size; if (pa_to_nid(physbase) != nid && diff --git a/include/asm-ppc64/lmb.h b/include/asm-ppc64/lmb.h index a6cbca2..cb368bf 100644 --- a/include/asm-ppc64/lmb.h +++ b/include/asm-ppc64/lmb.h @@ -22,7 +22,6 @@ struct lmb_property { unsigned long base; - unsigned long physbase; unsigned long size; }; -- cgit v0.10.2 From 71e1f55ad4bc4c8bcfe696400a950a34263a750e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 3 Aug 2005 20:21:26 +1000 Subject: [PATCH] ppc64: Simplify some lmb functions lmb_phys_mem_size() can always return lmb.memory.size, as long as it's called after lmb_analyze(), which it is. There's no need to recalculate the size on every call. lmb_analyze() was calculating a few things we then threw away, so just don't calculate them to start with. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/lmb.c b/arch/ppc64/kernel/lmb.c index 6ed6312..5adaca2 100644 --- a/arch/ppc64/kernel/lmb.c +++ b/arch/ppc64/kernel/lmb.c @@ -119,20 +119,12 @@ lmb_init(void) void __init lmb_analyze(void) { - unsigned long i; - unsigned long mem_size = 0; - unsigned long size_mask = 0; - - for (i=0; i < lmb.memory.cnt; i++) { - unsigned long lmb_size; + int i; - lmb_size = lmb.memory.region[i].size; + lmb.memory.size = 0; - mem_size += lmb_size; - size_mask |= lmb_size; - } - - lmb.memory.size = mem_size; + for (i = 0; i < lmb.memory.cnt; i++) + lmb.memory.size += lmb.memory.region[i].size; } /* This routine called with relocation disabled. */ @@ -266,20 +258,11 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr) return base; } +/* You must call lmb_analyze() before this. */ unsigned long __init lmb_phys_mem_size(void) { -#ifdef CONFIG_MSCHUNKS return lmb.memory.size; -#else - unsigned long total = 0; - int i; - - /* add all physical memory to the bootmem map */ - for (i=0; i < lmb.memory.cnt; i++) - total += lmb.memory.region[i].size; - return total; -#endif /* CONFIG_MSCHUNKS */ } unsigned long __init -- cgit v0.10.2 From bef5686229810709091fb6e505071f4aa41e3760 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 3 Aug 2005 20:21:26 +1000 Subject: [PATCH] ppc64: Remove CONFIG_MSCHUNKS We can now remove CONFIG_MSCHUNKS as it doesn't do anything interesting anymore. The only macro in abs_addr.h which is called by non-iSeries code is phys_to_abs(), so remove the other dummy implementations, and we add a firmware feature check to phys_to_abs(). Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index 4d4f81c..13b262f 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -302,12 +302,6 @@ config GENERIC_HARDIRQS bool default y -config MSCHUNKS - bool - depends on PPC_ISERIES - default y - - config PPC_RTAS bool depends on PPC_PSERIES || PPC_BPA diff --git a/arch/ppc64/configs/iSeries_defconfig b/arch/ppc64/configs/iSeries_defconfig index 394ba18..219c667 100644 --- a/arch/ppc64/configs/iSeries_defconfig +++ b/arch/ppc64/configs/iSeries_defconfig @@ -99,7 +99,6 @@ CONFIG_HZ_100=y # CONFIG_HZ_1000 is not set CONFIG_HZ=100 CONFIG_GENERIC_HARDIRQS=y -CONFIG_MSCHUNKS=y CONFIG_LPARCFG=y CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h index 200db1c..84c24d4 100644 --- a/include/asm-ppc64/abs_addr.h +++ b/include/asm-ppc64/abs_addr.h @@ -16,8 +16,7 @@ #include #include #include - -#ifdef CONFIG_MSCHUNKS +#include struct mschunks_map { unsigned long num_chunks; @@ -48,6 +47,10 @@ static inline unsigned long phys_to_abs(unsigned long pa) { unsigned long chunk; + /* This is a no-op on non-iSeries */ + if (!firmware_has_feature(FW_FEATURE_ISERIES)) + return pa; + chunk = addr_to_chunk(pa); if (chunk < mschunks_map.num_chunks) @@ -56,18 +59,6 @@ static inline unsigned long phys_to_abs(unsigned long pa) return chunk_to_addr(chunk) + (pa & MSCHUNKS_OFFSET_MASK); } -#else /* !CONFIG_MSCHUNKS */ - -#define chunk_to_addr(chunk) ((unsigned long)(chunk)) -#define addr_to_chunk(addr) (addr) -#define chunk_offset(addr) (0) -#define abs_chunk(pchunk) (pchunk) - -#define phys_to_abs(pa) (pa) -#define physRpn_to_absRpn(rpn) (rpn) - -#endif /* !CONFIG_MSCHUNKS */ - /* Convenience macros */ #define virt_to_abs(va) phys_to_abs(__pa(va)) #define abs_to_virt(aa) __va(aa) -- cgit v0.10.2 From b13cfd173f73c3f6f9a307b7b6e64d45fbd756b2 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Thu, 4 Aug 2005 19:26:42 +0200 Subject: [PATCH] ppc64: allow xmon=off If both CONFIG_XMON and CONFIG_XMON_DEFAULT is enabled in the .config, there is no way to disable xmon again. setup_system calls first xmon_init, later parse_early_param. So a new 'xmon=off' cmdline option will do the right thing. Signed-off-by: Olaf Hering Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index e9c24d2..b3ef8df 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -627,7 +627,7 @@ void __init setup_system(void) * Initialize xmon */ #ifdef CONFIG_XMON_DEFAULT - xmon_init(); + xmon_init(1); #endif /* * Register early console @@ -1343,11 +1343,13 @@ static int __init early_xmon(char *p) /* ensure xmon is enabled */ if (p) { if (strncmp(p, "on", 2) == 0) - xmon_init(); + xmon_init(1); + if (strncmp(p, "off", 3) == 0) + xmon_init(0); if (strncmp(p, "early", 5) != 0) return 0; } - xmon_init(); + xmon_init(1); debugger(NULL); return 0; diff --git a/arch/ppc64/xmon/start.c b/arch/ppc64/xmon/start.c index a9265bc..f86b584 100644 --- a/arch/ppc64/xmon/start.c +++ b/arch/ppc64/xmon/start.c @@ -27,7 +27,7 @@ static void sysrq_handle_xmon(int key, struct pt_regs *pt_regs, struct tty_struct *tty) { /* ensure xmon is enabled */ - xmon_init(); + xmon_init(1); debugger(pt_regs); } diff --git a/arch/ppc64/xmon/xmon.c b/arch/ppc64/xmon/xmon.c index 0553943..45908b1 100644 --- a/arch/ppc64/xmon/xmon.c +++ b/arch/ppc64/xmon/xmon.c @@ -2496,15 +2496,25 @@ static void dump_stab(void) } } -void xmon_init(void) -{ - __debugger = xmon; - __debugger_ipi = xmon_ipi; - __debugger_bpt = xmon_bpt; - __debugger_sstep = xmon_sstep; - __debugger_iabr_match = xmon_iabr_match; - __debugger_dabr_match = xmon_dabr_match; - __debugger_fault_handler = xmon_fault_handler; +void xmon_init(int enable) +{ + if (enable) { + __debugger = xmon; + __debugger_ipi = xmon_ipi; + __debugger_bpt = xmon_bpt; + __debugger_sstep = xmon_sstep; + __debugger_iabr_match = xmon_iabr_match; + __debugger_dabr_match = xmon_dabr_match; + __debugger_fault_handler = xmon_fault_handler; + } else { + __debugger = NULL; + __debugger_ipi = NULL; + __debugger_bpt = NULL; + __debugger_sstep = NULL; + __debugger_iabr_match = NULL; + __debugger_dabr_match = NULL; + __debugger_fault_handler = NULL; + } } void dump_segments(void) diff --git a/include/asm-ppc64/system.h b/include/asm-ppc64/system.h index 4104a5d..b9e1835 100644 --- a/include/asm-ppc64/system.h +++ b/include/asm-ppc64/system.h @@ -88,7 +88,7 @@ DEBUGGER_BOILERPLATE(debugger_dabr_match) DEBUGGER_BOILERPLATE(debugger_fault_handler) #ifdef CONFIG_XMON -extern void xmon_init(void); +extern void xmon_init(int enable); #endif #else -- cgit v0.10.2 From 180a33627d958d5d9d3602dde6ac74b315e136f0 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 9 Aug 2005 11:13:36 +1000 Subject: [PATCH] ppc64: Move ppc64_enable_pmcs() logic into a ppc_md function This patch moves power4_enable_pmcs() to arch/ppc64/kernel/pmc.c. I've tested it on P5 LPAR and P4. It does what it used to. Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/iSeries_setup.c b/arch/ppc64/kernel/iSeries_setup.c index b384a6a..3ffefbb 100644 --- a/arch/ppc64/kernel/iSeries_setup.c +++ b/arch/ppc64/kernel/iSeries_setup.c @@ -964,6 +964,8 @@ void __init iSeries_early_setup(void) ppc_md.calibrate_decr = iSeries_calibrate_decr; ppc_md.progress = iSeries_progress; + /* XXX Implement enable_pmcs for iSeries */ + if (get_paca()->lppaca.shared_proc) { ppc_md.idle_loop = iseries_shared_idle; printk(KERN_INFO "Using shared processor idle loop\n"); diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c index 54e0651..f0f0630 100644 --- a/arch/ppc64/kernel/pSeries_setup.c +++ b/arch/ppc64/kernel/pSeries_setup.c @@ -61,6 +61,7 @@ #include #include #include +#include #include "i8259.h" #include "mpic.h" @@ -187,6 +188,21 @@ static void __init pSeries_setup_mpic(void) " MPIC "); } +static void pseries_lpar_enable_pmcs(void) +{ + unsigned long set, reset; + + power4_enable_pmcs(); + + set = 1UL << 63; + reset = 0; + plpar_hcall_norets(H_PERFMON, set, reset); + + /* instruct hypervisor to maintain PMCs */ + if (firmware_has_feature(FW_FEATURE_SPLPAR)) + get_paca()->lppaca.pmcregs_in_use = 1; +} + static void __init pSeries_setup_arch(void) { /* Fixup ppc_md depending on the type of interrupt controller */ @@ -245,6 +261,11 @@ static void __init pSeries_setup_arch(void) printk(KERN_INFO "Using default idle loop\n"); ppc_md.idle_loop = default_idle; } + + if (systemcfg->platform & PLATFORM_LPAR) + ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; + else + ppc_md.enable_pmcs = power4_enable_pmcs; } static int __init pSeries_init_panel(void) diff --git a/arch/ppc64/kernel/pmac_setup.c b/arch/ppc64/kernel/pmac_setup.c index e40877f..8ff86a7 100644 --- a/arch/ppc64/kernel/pmac_setup.c +++ b/arch/ppc64/kernel/pmac_setup.c @@ -71,6 +71,7 @@ #include #include #include +#include #include "pmac.h" #include "mpic.h" @@ -511,4 +512,5 @@ struct machdep_calls __initdata pmac_md = { .progress = pmac_progress, .check_legacy_ioport = pmac_check_legacy_ioport, .idle_loop = native_idle, + .enable_pmcs = power4_enable_pmcs, }; diff --git a/arch/ppc64/kernel/pmc.c b/arch/ppc64/kernel/pmc.c index 67be773..cdfec74 100644 --- a/arch/ppc64/kernel/pmc.c +++ b/arch/ppc64/kernel/pmc.c @@ -65,3 +65,24 @@ void release_pmc_hardware(void) spin_unlock(&pmc_owner_lock); } EXPORT_SYMBOL_GPL(release_pmc_hardware); + +void power4_enable_pmcs(void) +{ + unsigned long hid0; + + hid0 = mfspr(HID0); + hid0 |= 1UL << (63 - 20); + + /* POWER4 requires the following sequence */ + asm volatile( + "sync\n" + "mtspr %1, %0\n" + "mfspr %0, %1\n" + "mfspr %0, %1\n" + "mfspr %0, %1\n" + "mfspr %0, %1\n" + "mfspr %0, %1\n" + "mfspr %0, %1\n" + "isync" : "=&r" (hid0) : "i" (HID0), "0" (hid0): + "memory"); +} diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c index eca15d2..f311ee7 100644 --- a/arch/ppc64/kernel/sysfs.c +++ b/arch/ppc64/kernel/sysfs.c @@ -101,6 +101,8 @@ static int __init setup_smt_snooze_delay(char *str) } __setup("smt-snooze-delay=", setup_smt_snooze_delay); +#endif /* CONFIG_PPC_MULTIPLATFORM */ + /* * Enabling PMCs will slow partition context switch times so we only do * it the first time we write to the PMCs. @@ -110,63 +112,15 @@ static DEFINE_PER_CPU(char, pmcs_enabled); void ppc64_enable_pmcs(void) { - unsigned long hid0; -#ifdef CONFIG_PPC_PSERIES - unsigned long set, reset; -#endif /* CONFIG_PPC_PSERIES */ - /* Only need to enable them once */ if (__get_cpu_var(pmcs_enabled)) return; __get_cpu_var(pmcs_enabled) = 1; - switch (systemcfg->platform) { - case PLATFORM_PSERIES: - case PLATFORM_POWERMAC: - hid0 = mfspr(HID0); - hid0 |= 1UL << (63 - 20); - - /* POWER4 requires the following sequence */ - asm volatile( - "sync\n" - "mtspr %1, %0\n" - "mfspr %0, %1\n" - "mfspr %0, %1\n" - "mfspr %0, %1\n" - "mfspr %0, %1\n" - "mfspr %0, %1\n" - "mfspr %0, %1\n" - "isync" : "=&r" (hid0) : "i" (HID0), "0" (hid0): - "memory"); - break; - -#ifdef CONFIG_PPC_PSERIES - case PLATFORM_PSERIES_LPAR: - set = 1UL << 63; - reset = 0; - plpar_hcall_norets(H_PERFMON, set, reset); - break; -#endif /* CONFIG_PPC_PSERIES */ - - default: - break; - } - - /* instruct hypervisor to maintain PMCs */ - if (firmware_has_feature(FW_FEATURE_SPLPAR)) - get_paca()->lppaca.pmcregs_in_use = 1; + if (ppc_md.enable_pmcs) + ppc_md.enable_pmcs(); } - -#else - -/* PMC stuff */ -void ppc64_enable_pmcs(void) -{ - /* XXX Implement for iseries */ -} -#endif /* CONFIG_PPC_MULTIPLATFORM */ - EXPORT_SYMBOL(ppc64_enable_pmcs); /* XXX convert to rusty's on_one_cpu */ diff --git a/include/asm-ppc64/machdep.h b/include/asm-ppc64/machdep.h index f0ef063..ff2c928 100644 --- a/include/asm-ppc64/machdep.h +++ b/include/asm-ppc64/machdep.h @@ -140,6 +140,9 @@ struct machdep_calls { /* Idle loop for this platform, leave empty for default idle loop */ int (*idle_loop)(void); + + /* Function to enable pmcs for this platform, called once per cpu. */ + void (*enable_pmcs)(void); }; extern int default_idle(void); diff --git a/include/asm-ppc64/pmc.h b/include/asm-ppc64/pmc.h index c924748..d1d297d 100644 --- a/include/asm-ppc64/pmc.h +++ b/include/asm-ppc64/pmc.h @@ -26,4 +26,6 @@ typedef void (*perf_irq_t)(struct pt_regs *); int reserve_pmc_hardware(perf_irq_t new_perf_irq); void release_pmc_hardware(void); +void power4_enable_pmcs(void); + #endif /* _PPC64_PMC_H */ -- cgit v0.10.2 From 145ec7d51ae507c7cc8889ad05e24af05bbd9147 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 9 Aug 2005 15:20:18 +1000 Subject: [PATCH] ppc64: Fix a misleading printk in unflatten_dt_node() When unflatten_dt_node() fails to find an OF_DT_END_NODE tag it prints "Weird tag at start of node", this should be "Weird tag at end of node". Signed-off-by: Michael Ellerman arch/ppc64/kernel/prom.c | 2 +- 1 files changed, 1 insertion(+), 1 deletion(-) Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c index 255c39a..04b852d 100644 --- a/arch/ppc64/kernel/prom.c +++ b/arch/ppc64/kernel/prom.c @@ -918,7 +918,7 @@ static unsigned long __init unflatten_dt_node(unsigned long mem, tag = *((u32 *)(*p)); } if (tag != OF_DT_END_NODE) { - printk("Weird tag at start of node: %x\n", tag); + printk("Weird tag at end of node: %x\n", tag); return mem; } *p += 4; -- cgit v0.10.2 From 95920324f51b3a12603cf6d9bacbd831f34c5b60 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 9 Aug 2005 15:20:19 +1000 Subject: [PATCH] ppc64: unflatten_device_tree() should check if lmb_alloc() fails unflatten_device_tree() doesn't check if lmb_alloc() succeeds or not, it should. All it can do is panic, but at least there's an error message (assuming you have some sort of console at that point). Signed-off-by: Michael Ellerman arch/ppc64/kernel/prom.c | 9 +++++++-- 1 files changed, 7 insertions(+), 2 deletions(-) Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c index 04b852d..b218488 100644 --- a/arch/ppc64/kernel/prom.c +++ b/arch/ppc64/kernel/prom.c @@ -950,8 +950,13 @@ void __init unflatten_device_tree(void) DBG(" size is %lx, allocating...\n", size); /* Allocate memory for the expanded device tree */ - mem = (unsigned long)abs_to_virt(lmb_alloc(size + 4, - __alignof__(struct device_node))); + mem = lmb_alloc(size + 4, __alignof__(struct device_node)); + if (!mem) { + DBG("Couldn't allocate memory with lmb_alloc()!\n"); + panic("Couldn't allocate memory with lmb_alloc()!\n"); + } + mem = (unsigned long)abs_to_virt(mem); + ((u32 *)mem)[size / 4] = 0xdeadbeef; DBG(" unflattening...\n", mem); -- cgit v0.10.2 From 9a5573e378c5c8976c6000a7643b52e2a0481688 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 9 Aug 2005 15:20:20 +1000 Subject: [PATCH] ppc64: Check of_chosen in check_for_initrd() You can't call get_property() on a NULL node, so check if of_chosen is set in check_for_initrd(). Signed-off-by: Michael Ellerman arch/ppc64/kernel/setup.c | 20 ++++++++++++-------- 1 files changed, 12 insertions(+), 8 deletions(-) Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index b3ef8df..ee3b20d 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -536,15 +536,19 @@ static void __init check_for_initrd(void) DBG(" -> check_for_initrd()\n"); - prop = (u64 *)get_property(of_chosen, "linux,initrd-start", NULL); - if (prop != NULL) { - initrd_start = (unsigned long)__va(*prop); - prop = (u64 *)get_property(of_chosen, "linux,initrd-end", NULL); + if (of_chosen) { + prop = (u64 *)get_property(of_chosen, + "linux,initrd-start", NULL); if (prop != NULL) { - initrd_end = (unsigned long)__va(*prop); - initrd_below_start_ok = 1; - } else - initrd_start = 0; + initrd_start = (unsigned long)__va(*prop); + prop = (u64 *)get_property(of_chosen, + "linux,initrd-end", NULL); + if (prop != NULL) { + initrd_end = (unsigned long)__va(*prop); + initrd_below_start_ok = 1; + } else + initrd_start = 0; + } } /* If we were passed an initrd, set the ROOT_DEV properly if the values -- cgit v0.10.2 From c594adad5653491813959277fb87a2fef54c4e05 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 11 Aug 2005 16:55:21 +1000 Subject: [PATCH] Dynamic hugepage addresses for ppc64 Paulus, I think this is now a reasonable candidate for the post-2.6.13 queue. Relax address restrictions for hugepages on ppc64 Presently, 64-bit applications on ppc64 may only use hugepages in the address region from 1-1.5T. Furthermore, if hugepages are enabled in the kernel config, they may only use hugepages and never normal pages in this area. This patch relaxes this restriction, allowing any address to be used with hugepages, but with a 1TB granularity. That is if you map a hugepage anywhere in the region 1TB-2TB, that entire area will be reserved exclusively for hugepages for the remainder of the process's lifetime. This works analagously to hugepages in 32-bit applications, where hugepages can be mapped anywhere, but with 256MB (mmu segment) granularity. This patch applies on top of the four level pagetable patch (http://patchwork.ozlabs.org/linuxppc64/patch?id=1936). Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/asm-offsets.c b/arch/ppc64/kernel/asm-offsets.c index abb9e5b..17e35d0 100644 --- a/arch/ppc64/kernel/asm-offsets.c +++ b/arch/ppc64/kernel/asm-offsets.c @@ -94,7 +94,8 @@ int main(void) DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); #ifdef CONFIG_HUGETLB_PAGE - DEFINE(PACAHTLBSEGS, offsetof(struct paca_struct, context.htlb_segs)); + DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas)); + DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas)); #endif /* CONFIG_HUGETLB_PAGE */ DEFINE(PACADEFAULTDECR, offsetof(struct paca_struct, default_decr)); DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen)); diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c index a13e442..e7833c8 100644 --- a/arch/ppc64/mm/hugetlbpage.c +++ b/arch/ppc64/mm/hugetlbpage.c @@ -27,6 +27,9 @@ #include +#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) +#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) + /* Modelled after find_linux_pte() */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { @@ -129,15 +132,17 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -static void flush_segments(void *parm) +static void flush_low_segments(void *parm) { - u16 segs = (unsigned long) parm; + u16 areas = (unsigned long) parm; unsigned long i; asm volatile("isync" : : : "memory"); - for (i = 0; i < 16; i++) { - if (! (segs & (1U << i))) + BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS); + + for (i = 0; i < NUM_LOW_AREAS; i++) { + if (! (areas & (1U << i))) continue; asm volatile("slbie %0" : : "r" (i << SID_SHIFT)); } @@ -145,13 +150,33 @@ static void flush_segments(void *parm) asm volatile("isync" : : : "memory"); } -static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg) +static void flush_high_segments(void *parm) +{ + u16 areas = (unsigned long) parm; + unsigned long i, j; + + asm volatile("isync" : : : "memory"); + + BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS); + + for (i = 0; i < NUM_HIGH_AREAS; i++) { + if (! (areas & (1U << i))) + continue; + for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++) + asm volatile("slbie %0" + :: "r" ((i << HTLB_AREA_SHIFT) + (j << SID_SHIFT))); + } + + asm volatile("isync" : : : "memory"); +} + +static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area) { - unsigned long start = seg << SID_SHIFT; - unsigned long end = (seg+1) << SID_SHIFT; + unsigned long start = area << SID_SHIFT; + unsigned long end = (area+1) << SID_SHIFT; struct vm_area_struct *vma; - BUG_ON(seg >= 16); + BUG_ON(area >= NUM_LOW_AREAS); /* Check no VMAs are in the region */ vma = find_vma(mm, start); @@ -161,20 +186,69 @@ static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg) return 0; } -static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs) +static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area) +{ + unsigned long start = area << HTLB_AREA_SHIFT; + unsigned long end = (area+1) << HTLB_AREA_SHIFT; + struct vm_area_struct *vma; + + BUG_ON(area >= NUM_HIGH_AREAS); + + /* Check no VMAs are in the region */ + vma = find_vma(mm, start); + if (vma && (vma->vm_start < end)) + return -EBUSY; + + return 0; +} + +static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas) { unsigned long i; - newsegs &= ~(mm->context.htlb_segs); - if (! newsegs) + BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS); + BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS); + + newareas &= ~(mm->context.low_htlb_areas); + if (! newareas) return 0; /* The segments we want are already open */ - for (i = 0; i < 16; i++) - if ((1 << i) & newsegs) - if (prepare_low_seg_for_htlb(mm, i) != 0) + for (i = 0; i < NUM_LOW_AREAS; i++) + if ((1 << i) & newareas) + if (prepare_low_area_for_htlb(mm, i) != 0) + return -EBUSY; + + mm->context.low_htlb_areas |= newareas; + + /* update the paca copy of the context struct */ + get_paca()->context = mm->context; + + /* the context change must make it to memory before the flush, + * so that further SLB misses do the right thing. */ + mb(); + on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1); + + return 0; +} + +static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas) +{ + unsigned long i; + + BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS); + BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8) + != NUM_HIGH_AREAS); + + newareas &= ~(mm->context.high_htlb_areas); + if (! newareas) + return 0; /* The areas we want are already open */ + + for (i = 0; i < NUM_HIGH_AREAS; i++) + if ((1 << i) & newareas) + if (prepare_high_area_for_htlb(mm, i) != 0) return -EBUSY; - mm->context.htlb_segs |= newsegs; + mm->context.high_htlb_areas |= newareas; /* update the paca copy of the context struct */ get_paca()->context = mm->context; @@ -182,29 +256,33 @@ static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs) /* the context change must make it to memory before the flush, * so that further SLB misses do the right thing. */ mb(); - on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1); + on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1); return 0; } int prepare_hugepage_range(unsigned long addr, unsigned long len) { - if (within_hugepage_high_range(addr, len)) - return 0; - else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) { - int err; - /* Yes, we need both tests, in case addr+len overflows - * 64-bit arithmetic */ - err = open_low_hpage_segs(current->mm, + int err; + + if ( (addr+len) < addr ) + return -EINVAL; + + if ((addr + len) < 0x100000000UL) + err = open_low_hpage_areas(current->mm, LOW_ESID_MASK(addr, len)); - if (err) - printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)" - " failed (segs: 0x%04hx)\n", addr, len, - LOW_ESID_MASK(addr, len)); + else + err = open_high_hpage_areas(current->mm, + HTLB_AREA_MASK(addr, len)); + if (err) { + printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)" + " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n", + addr, len, + LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len)); return err; } - return -EINVAL; + return 0; } struct page * @@ -276,8 +354,8 @@ full_search: vma = find_vma(mm, addr); continue; } - if (touches_hugepage_high_range(addr, len)) { - addr = TASK_HPAGE_END; + if (touches_hugepage_high_range(mm, addr, len)) { + addr = ALIGN(addr+1, 1UL<mm, addr); - for (vma = find_vma(current->mm, addr); - addr + len <= TASK_HPAGE_END; - vma = vma->vm_next) { + while (addr + len <= TASK_SIZE_USER64) { BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ - BUG_ON(! within_hugepage_high_range(addr, len)); + + if (! __within_hugepage_high_range(addr, len, areamask)) { + addr = ALIGN(addr+1, 1UL<mm, addr); + continue; + } if (!vma || (addr + len) <= vma->vm_start) return addr; addr = ALIGN(vma->vm_end, HPAGE_SIZE); - /* Because we're in a hugepage region, this alignment - * should not skip us over any VMAs */ + /* Depending on segmask this might not be a confirmed + * hugepage region, so the ALIGN could have skipped + * some VMAs */ + vma = find_vma(current->mm, addr); } return -ENOMEM; @@ -474,6 +558,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + int lastshift; + u16 areamask, curareas; + if (len & ~HPAGE_MASK) return -EINVAL; @@ -481,31 +568,49 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -EINVAL; if (test_thread_flag(TIF_32BIT)) { - int lastshift = 0; - u16 segmask, cursegs = current->mm->context.htlb_segs; + curareas = current->mm->context.low_htlb_areas; /* First see if we can do the mapping in the existing - * low hpage segments */ - addr = htlb_get_low_area(len, cursegs); + * low areas */ + addr = htlb_get_low_area(len, curareas); if (addr != -ENOMEM) return addr; - for (segmask = LOW_ESID_MASK(0x100000000UL-len, len); - ! lastshift; segmask >>=1) { - if (segmask & 1) + lastshift = 0; + for (areamask = LOW_ESID_MASK(0x100000000UL-len, len); + ! lastshift; areamask >>=1) { + if (areamask & 1) lastshift = 1; - addr = htlb_get_low_area(len, cursegs | segmask); + addr = htlb_get_low_area(len, curareas | areamask); if ((addr != -ENOMEM) - && open_low_hpage_segs(current->mm, segmask) == 0) + && open_low_hpage_areas(current->mm, areamask) == 0) return addr; } - printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open" - " enough segments\n"); - return -ENOMEM; } else { - return htlb_get_high_area(len); + curareas = current->mm->context.high_htlb_areas; + + /* First see if we can do the mapping in the existing + * high areas */ + addr = htlb_get_high_area(len, curareas); + if (addr != -ENOMEM) + return addr; + + lastshift = 0; + for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len); + ! lastshift; areamask >>=1) { + if (areamask & 1) + lastshift = 1; + + addr = htlb_get_high_area(len, curareas | areamask); + if ((addr != -ENOMEM) + && open_high_hpage_areas(current->mm, areamask) == 0) + return addr; + } } + printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open" + " enough areas\n"); + return -ENOMEM; } int hash_huge_page(struct mm_struct *mm, unsigned long access, diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S index f20fc52..bab2558 100644 --- a/arch/ppc64/mm/slb_low.S +++ b/arch/ppc64/mm/slb_low.S @@ -89,28 +89,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) b 9f 0: /* user address: proto-VSID = context<<15 | ESID */ - li r11,SLB_VSID_USER - srdi. r9,r3,USER_ESID_BITS bne- 8f /* invalid ea bits set */ #ifdef CONFIG_HUGETLB_PAGE BEGIN_FTR_SECTION - /* check against the hugepage ranges */ - cmpldi r3,(TASK_HPAGE_END>>SID_SHIFT) - bge 6f /* >= TASK_HPAGE_END */ - cmpldi r3,(TASK_HPAGE_BASE>>SID_SHIFT) - bge 5f /* TASK_HPAGE_BASE..TASK_HPAGE_END */ + lhz r9,PACAHIGHHTLBAREAS(r13) + srdi r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT) + srd r9,r9,r11 + andi. r9,r9,1 + bne 5f + + li r11,SLB_VSID_USER + cmpldi r3,16 - bge 6f /* 4GB..TASK_HPAGE_BASE */ + bge 6f - lhz r9,PACAHTLBSEGS(r13) + lhz r9,PACALOWHTLBAREAS(r13) srd r9,r9,r3 andi. r9,r9,1 + beq 6f -5: /* this is a hugepage user address */ - li r11,(SLB_VSID_USER|SLB_VSID_L) +5: li r11,SLB_VSID_USER|SLB_VSID_L END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h index 789c269..ad36bb2 100644 --- a/include/asm-ppc64/mmu.h +++ b/include/asm-ppc64/mmu.h @@ -307,7 +307,7 @@ typedef unsigned long mm_context_id_t; typedef struct { mm_context_id_t id; #ifdef CONFIG_HUGETLB_PAGE - u16 htlb_segs; /* bitmask */ + u16 low_htlb_areas, high_htlb_areas; #endif } mm_context_t; diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h index 7e7b18e..a79a08d 100644 --- a/include/asm-ppc64/page.h +++ b/include/asm-ppc64/page.h @@ -37,40 +37,45 @@ #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -/* For 64-bit processes the hugepage range is 1T-1.5T */ -#define TASK_HPAGE_BASE ASM_CONST(0x0000010000000000) -#define TASK_HPAGE_END ASM_CONST(0x0000018000000000) +#define HTLB_AREA_SHIFT 40 +#define HTLB_AREA_SIZE (1UL << HTLB_AREA_SHIFT) +#define GET_HTLB_AREA(x) ((x) >> HTLB_AREA_SHIFT) #define LOW_ESID_MASK(addr, len) (((1U << (GET_ESID(addr+len-1)+1)) \ - (1U << GET_ESID(addr))) & 0xffff) +#define HTLB_AREA_MASK(addr, len) (((1U << (GET_HTLB_AREA(addr+len-1)+1)) \ + - (1U << GET_HTLB_AREA(addr))) & 0xffff) #define ARCH_HAS_HUGEPAGE_ONLY_RANGE #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE #define ARCH_HAS_SETCLEAR_HUGE_PTE #define touches_hugepage_low_range(mm, addr, len) \ - (LOW_ESID_MASK((addr), (len)) & mm->context.htlb_segs) -#define touches_hugepage_high_range(addr, len) \ - (((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END)) + (LOW_ESID_MASK((addr), (len)) & (mm)->context.low_htlb_areas) +#define touches_hugepage_high_range(mm, addr, len) \ + (HTLB_AREA_MASK((addr), (len)) & (mm)->context.high_htlb_areas) #define __within_hugepage_low_range(addr, len, segmask) \ ((LOW_ESID_MASK((addr), (len)) | (segmask)) == (segmask)) #define within_hugepage_low_range(addr, len) \ __within_hugepage_low_range((addr), (len), \ - current->mm->context.htlb_segs) -#define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \ - && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr))) + current->mm->context.low_htlb_areas) +#define __within_hugepage_high_range(addr, len, zonemask) \ + ((HTLB_AREA_MASK((addr), (len)) | (zonemask)) == (zonemask)) +#define within_hugepage_high_range(addr, len) \ + __within_hugepage_high_range((addr), (len), \ + current->mm->context.high_htlb_areas) #define is_hugepage_only_range(mm, addr, len) \ - (touches_hugepage_high_range((addr), (len)) || \ + (touches_hugepage_high_range((mm), (addr), (len)) || \ touches_hugepage_low_range((mm), (addr), (len))) #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #define in_hugepage_area(context, addr) \ (cpu_has_feature(CPU_FTR_16M_PAGE) && \ - ( (((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \ + ( ((1 << GET_HTLB_AREA(addr)) & (context).high_htlb_areas) || \ ( ((addr) < 0x100000000L) && \ - ((1 << GET_ESID(addr)) & (context).htlb_segs) ) ) ) + ((1 << GET_ESID(addr)) & (context).low_htlb_areas) ) ) ) #else /* !CONFIG_HUGETLB_PAGE */ -- cgit v0.10.2 From 2cba582a49f1535c1a12a687cfb3dab713c22cc4 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Mon, 29 Aug 2005 05:12:30 -0400 Subject: [libata sata_promise] Do not attempt to use SATA phy on PATA controllers diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c index 919fb31..ad31b8a 100644 --- a/drivers/scsi/sata_promise.c +++ b/drivers/scsi/sata_promise.c @@ -79,7 +79,8 @@ static irqreturn_t pdc_interrupt (int irq, void *dev_instance, struct pt_regs *r static void pdc_eng_timeout(struct ata_port *ap); static int pdc_port_start(struct ata_port *ap); static void pdc_port_stop(struct ata_port *ap); -static void pdc_phy_reset(struct ata_port *ap); +static void pdc_pata_phy_reset(struct ata_port *ap); +static void pdc_sata_phy_reset(struct ata_port *ap); static void pdc_qc_prep(struct ata_queued_cmd *qc); static void pdc_tf_load_mmio(struct ata_port *ap, struct ata_taskfile *tf); static void pdc_exec_command_mmio(struct ata_port *ap, struct ata_taskfile *tf); @@ -106,19 +107,22 @@ static Scsi_Host_Template pdc_ata_sht = { .ordered_flush = 1, }; -static struct ata_port_operations pdc_ata_ops = { +static struct ata_port_operations pdc_sata_ops = { .port_disable = ata_port_disable, .tf_load = pdc_tf_load_mmio, .tf_read = ata_tf_read, .check_status = ata_check_status, .exec_command = pdc_exec_command_mmio, .dev_select = ata_std_dev_select, - .phy_reset = pdc_phy_reset, + + .phy_reset = pdc_sata_phy_reset, + .qc_prep = pdc_qc_prep, .qc_issue = pdc_qc_issue_prot, .eng_timeout = pdc_eng_timeout, .irq_handler = pdc_interrupt, .irq_clear = pdc_irq_clear, + .scr_read = pdc_sata_scr_read, .scr_write = pdc_sata_scr_write, .port_start = pdc_port_start, @@ -126,6 +130,27 @@ static struct ata_port_operations pdc_ata_ops = { .host_stop = ata_host_stop, }; +static struct ata_port_operations pdc_pata_ops = { + .port_disable = ata_port_disable, + .tf_load = pdc_tf_load_mmio, + .tf_read = ata_tf_read, + .check_status = ata_check_status, + .exec_command = pdc_exec_command_mmio, + .dev_select = ata_std_dev_select, + + .phy_reset = pdc_pata_phy_reset, + + .qc_prep = pdc_qc_prep, + .qc_issue = pdc_qc_issue_prot, + .eng_timeout = pdc_eng_timeout, + .irq_handler = pdc_interrupt, + .irq_clear = pdc_irq_clear, + + .port_start = pdc_port_start, + .port_stop = pdc_port_stop, + .host_stop = ata_host_stop, +}; + static struct ata_port_info pdc_port_info[] = { /* board_2037x */ { @@ -135,7 +160,7 @@ static struct ata_port_info pdc_port_info[] = { .pio_mask = 0x1f, /* pio0-4 */ .mwdma_mask = 0x07, /* mwdma0-2 */ .udma_mask = 0x7f, /* udma0-6 ; FIXME */ - .port_ops = &pdc_ata_ops, + .port_ops = &pdc_sata_ops, }, /* board_20319 */ @@ -146,7 +171,7 @@ static struct ata_port_info pdc_port_info[] = { .pio_mask = 0x1f, /* pio0-4 */ .mwdma_mask = 0x07, /* mwdma0-2 */ .udma_mask = 0x7f, /* udma0-6 ; FIXME */ - .port_ops = &pdc_ata_ops, + .port_ops = &pdc_sata_ops, }, /* board_20619 */ @@ -157,7 +182,7 @@ static struct ata_port_info pdc_port_info[] = { .pio_mask = 0x1f, /* pio0-4 */ .mwdma_mask = 0x07, /* mwdma0-2 */ .udma_mask = 0x7f, /* udma0-6 ; FIXME */ - .port_ops = &pdc_ata_ops, + .port_ops = &pdc_pata_ops, }, }; @@ -268,12 +293,23 @@ static void pdc_reset_port(struct ata_port *ap) readl(mmio); /* flush */ } -static void pdc_phy_reset(struct ata_port *ap) +static void pdc_sata_phy_reset(struct ata_port *ap) { pdc_reset_port(ap); sata_phy_reset(ap); } +static void pdc_pata_phy_reset(struct ata_port *ap) +{ + /* FIXME: add cable detect. Don't assume 40-pin cable */ + ap->cbl = ATA_CBL_PATA40; + ap->udma_mask &= ATA_UDMA_MASK_40C; + + pdc_reset_port(ap); + ata_port_probe(ap); + ata_bus_reset(ap); +} + static u32 pdc_sata_scr_read (struct ata_port *ap, unsigned int sc_reg) { if (sc_reg > SCR_CONTROL) -- cgit v0.10.2 From aa7e16d6b88b3b38db0d2ee49ed5e44e7b2045ec Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Mon, 29 Aug 2005 15:12:56 -0400 Subject: [libata sata_nv] NVIDIA ok'd license change from OSL+GPL to GPL diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c index 1e10370..a61c150 100644 --- a/drivers/scsi/sata_nv.c +++ b/drivers/scsi/sata_nv.c @@ -4,21 +4,20 @@ * Copyright 2004 NVIDIA Corp. All rights reserved. * Copyright 2004 Andrew Chew * - * The contents of this file are subject to the Open - * Software License version 1.1 that can be found at - * http://www.opensource.org/licenses/osl-1.1.txt and is included herein - * by reference. * - * Alternatively, the contents of this file may be used under the terms - * of the GNU General Public License version 2 (the "GPL") as distributed - * in the kernel source COPYING file, in which case the provisions of - * the GPL are applicable instead of the above. If you wish to allow - * the use of your version of this file only under the terms of the - * GPL and not to allow others to use your version of this file under - * the OSL, indicate your decision by deleting the provisions above and - * replace them with the notice and other provisions required by the GPL. - * If you do not delete the provisions above, a recipient may use your - * version of this file under either the OSL or the GPL. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. * * * libata documentation is available via 'make {ps|pdf}docs', -- cgit v0.10.2 From 5ea68e02766c52c153c62fc423cda659a80e45fa Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 29 Aug 2005 12:44:40 -0700 Subject: [SPARC64]: Fix trap state reading for instruction_access_exception. 1) Read ASI_IMMU SFSR not ASI_DMMU. 2) IMMU has no SFAR, read TPC instead 3) Delete old and incorrect comment about the DTLB protection trap having a dependency on the SFSR contents in order to function correctly Signed-off-by: David S. Miller diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S index 88332f0..6d0476f 100644 --- a/arch/sparc64/kernel/entry.S +++ b/arch/sparc64/kernel/entry.S @@ -690,11 +690,6 @@ netbsd_syscall: retl nop - /* These next few routines must be sure to clear the - * SFSR FaultValid bit so that the fast tlb data protection - * handler does not flush the wrong context and lock up the - * box. - */ .globl __do_data_access_exception .globl __do_data_access_exception_tl1 __do_data_access_exception_tl1: @@ -733,9 +728,8 @@ __do_instruction_access_exception_tl1: rdpr %pstate, %g4 wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate mov TLB_SFSR, %g3 - mov DMMU_SFAR, %g5 - ldxa [%g3] ASI_DMMU, %g4 ! Get SFSR - ldxa [%g5] ASI_DMMU, %g5 ! Get SFAR + ldxa [%g3] ASI_IMMU, %g4 ! Get SFSR + rdpr %tpc, %g5 ! IMMU has no SFAR, use TPC stxa %g0, [%g3] ASI_IMMU ! Clear FaultValid bit membar #Sync sethi %hi(109f), %g7 @@ -752,9 +746,8 @@ __do_instruction_access_exception: rdpr %pstate, %g4 wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate mov TLB_SFSR, %g3 - mov DMMU_SFAR, %g5 - ldxa [%g3] ASI_DMMU, %g4 ! Get SFSR - ldxa [%g5] ASI_DMMU, %g5 ! Get SFAR + ldxa [%g3] ASI_IMMU, %g4 ! Get SFSR + rdpr %tpc, %g5 ! IMMU has no SFAR, use TPC stxa %g0, [%g3] ASI_IMMU ! Clear FaultValid bit membar #Sync sethi %hi(109f), %g7 -- cgit v0.10.2 From bde4e4ee9f90142d550e2684dec2c8df302f5f8e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 29 Aug 2005 12:44:57 -0700 Subject: [SPARC64]: Do not call winfix_dax blindly Verify we really are taking a data access exception trap, at TL1, from one of the window spill/fill handlers. Else call a new function, data_access_exception_tl1, to log the error. Signed-off-by: David S. Miller diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S index 6d0476f..214cd0e 100644 --- a/arch/sparc64/kernel/entry.S +++ b/arch/sparc64/kernel/entry.S @@ -701,8 +701,24 @@ __do_data_access_exception_tl1: ldxa [%g5] ASI_DMMU, %g5 ! Get SFAR stxa %g0, [%g3] ASI_DMMU ! Clear SFSR.FaultValid bit membar #Sync + rdpr %tt, %g3 + cmp %g3, 0x80 ! first win spill/fill trap + blu,pn %xcc, 1f + cmp %g3, 0xff ! last win spill/fill trap + bgu,pn %xcc, 1f + nop ba,pt %xcc, winfix_dax rdpr %tpc, %g3 +1: sethi %hi(109f), %g7 + ba,pt %xcc, etraptl1 +109: or %g7, %lo(109b), %g7 + mov %l4, %o1 + mov %l5, %o2 + call data_access_exception_tl1 + add %sp, PTREGS_OFF, %o0 + ba,pt %xcc, rtrap + clr %l6 + __do_data_access_exception: rdpr %pstate, %g4 wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c index 0c9e54b..210b3e3 100644 --- a/arch/sparc64/kernel/traps.c +++ b/arch/sparc64/kernel/traps.c @@ -220,6 +220,17 @@ void data_access_exception(struct pt_regs *regs, force_sig_info(SIGSEGV, &info, current); } +void data_access_exception_tl1(struct pt_regs *regs, + unsigned long sfsr, unsigned long sfar) +{ + if (notify_die(DIE_TRAP_TL1, "data access exception tl1", regs, + 0, 0x30, SIGTRAP) == NOTIFY_STOP) + return; + + dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); + data_access_exception(regs, sfsr, sfar); +} + #ifdef CONFIG_PCI /* This is really pathetic... */ extern volatile int pci_poke_in_progress; -- cgit v0.10.2 From 6c52a96e6cacb35403b85c3b42db0faf26f3ed85 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 29 Aug 2005 12:45:11 -0700 Subject: [SPARC64]: Revamp Spitfire error trap handling. Current uncorrectable error handling was poor enough that the processor could just loop taking the same trap over and over again. Fix things up so that we at least get a log message and perhaps even some register state. In the process, much consolidation became possible, particularly with the correctable error handler. Prefix assembler and C function names with "spitfire" to indicate that these are for Ultra-I/II/IIi/IIe only. More work is needed to make these routines robust and featureful to the level of the Ultra-III error handlers. Signed-off-by: David S. Miller diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S index 214cd0e..cecdc0a 100644 --- a/arch/sparc64/kernel/entry.S +++ b/arch/sparc64/kernel/entry.S @@ -21,6 +21,7 @@ #include #include #include +#include #define curptr g6 @@ -690,9 +691,159 @@ netbsd_syscall: retl nop - .globl __do_data_access_exception - .globl __do_data_access_exception_tl1 -__do_data_access_exception_tl1: + /* We need to carefully read the error status, ACK + * the errors, prevent recursive traps, and pass the + * information on to C code for logging. + * + * We pass the AFAR in as-is, and we encode the status + * information as described in asm-sparc64/sfafsr.h + */ + .globl __spitfire_access_error +__spitfire_access_error: + /* Disable ESTATE error reporting so that we do not + * take recursive traps and RED state the processor. + */ + stxa %g0, [%g0] ASI_ESTATE_ERROR_EN + membar #Sync + + mov UDBE_UE, %g1 + ldxa [%g0] ASI_AFSR, %g4 ! Get AFSR + + /* __spitfire_cee_trap branches here with AFSR in %g4 and + * UDBE_CE in %g1. It only clears ESTATE_ERR_CE in the + * ESTATE Error Enable register. + */ +__spitfire_cee_trap_continue: + ldxa [%g0] ASI_AFAR, %g5 ! Get AFAR + + rdpr %tt, %g3 + and %g3, 0x1ff, %g3 ! Paranoia + sllx %g3, SFSTAT_TRAP_TYPE_SHIFT, %g3 + or %g4, %g3, %g4 + rdpr %tl, %g3 + cmp %g3, 1 + mov 1, %g3 + bleu %xcc, 1f + sllx %g3, SFSTAT_TL_GT_ONE_SHIFT, %g3 + + or %g4, %g3, %g4 + + /* Read in the UDB error register state, clearing the + * sticky error bits as-needed. We only clear them if + * the UE bit is set. Likewise, __spitfire_cee_trap + * below will only do so if the CE bit is set. + * + * NOTE: UltraSparc-I/II have high and low UDB error + * registers, corresponding to the two UDB units + * present on those chips. UltraSparc-IIi only + * has a single UDB, called "SDB" in the manual. + * For IIi the upper UDB register always reads + * as zero so for our purposes things will just + * work with the checks below. + */ +1: ldxa [%g0] ASI_UDBH_ERROR_R, %g3 + and %g3, 0x3ff, %g7 ! Paranoia + sllx %g7, SFSTAT_UDBH_SHIFT, %g7 + or %g4, %g7, %g4 + andcc %g3, %g1, %g3 ! UDBE_UE or UDBE_CE + be,pn %xcc, 1f + nop + stxa %g3, [%g0] ASI_UDB_ERROR_W + membar #Sync + +1: mov 0x18, %g3 + ldxa [%g3] ASI_UDBL_ERROR_R, %g3 + and %g3, 0x3ff, %g7 ! Paranoia + sllx %g7, SFSTAT_UDBL_SHIFT, %g7 + or %g4, %g7, %g4 + andcc %g3, %g1, %g3 ! UDBE_UE or UDBE_CE + be,pn %xcc, 1f + nop + mov 0x18, %g7 + stxa %g3, [%g7] ASI_UDB_ERROR_W + membar #Sync + +1: /* Ok, now that we've latched the error state, + * clear the sticky bits in the AFSR. + */ + stxa %g4, [%g0] ASI_AFSR + membar #Sync + + rdpr %tl, %g2 + cmp %g2, 1 + rdpr %pil, %g2 + bleu,pt %xcc, 1f + wrpr %g0, 15, %pil + + ba,pt %xcc, etraptl1 + rd %pc, %g7 + + ba,pt %xcc, 2f + nop + +1: ba,pt %xcc, etrap_irq + rd %pc, %g7 + +2: mov %l4, %o1 + mov %l5, %o2 + call spitfire_access_error + add %sp, PTREGS_OFF, %o0 + ba,pt %xcc, rtrap + clr %l6 + + /* This is the trap handler entry point for ECC correctable + * errors. They are corrected, but we listen for the trap + * so that the event can be logged. + * + * Disrupting errors are either: + * 1) single-bit ECC errors during UDB reads to system + * memory + * 2) data parity errors during write-back events + * + * As far as I can make out from the manual, the CEE trap + * is only for correctable errors during memory read + * accesses by the front-end of the processor. + * + * The code below is only for trap level 1 CEE events, + * as it is the only situation where we can safely record + * and log. For trap level >1 we just clear the CE bit + * in the AFSR and return. + * + * This is just like __spiftire_access_error above, but it + * specifically handles correctable errors. If an + * uncorrectable error is indicated in the AFSR we + * will branch directly above to __spitfire_access_error + * to handle it instead. Uncorrectable therefore takes + * priority over correctable, and the error logging + * C code will notice this case by inspecting the + * trap type. + */ + .globl __spitfire_cee_trap +__spitfire_cee_trap: + ldxa [%g0] ASI_AFSR, %g4 ! Get AFSR + mov 1, %g3 + sllx %g3, SFAFSR_UE_SHIFT, %g3 + andcc %g4, %g3, %g0 ! Check for UE + bne,pn %xcc, __spitfire_access_error + nop + + /* Ok, in this case we only have a correctable error. + * Indicate we only wish to capture that state in register + * %g1, and we only disable CE error reporting unlike UE + * handling which disables all errors. + */ + ldxa [%g0] ASI_ESTATE_ERROR_EN, %g3 + andn %g3, ESTATE_ERR_CE, %g3 + stxa %g3, [%g0] ASI_ESTATE_ERROR_EN + membar #Sync + + /* Preserve AFSR in %g4, indicate UDB state to capture in %g1 */ + ba,pt %xcc, __spitfire_cee_trap_continue + mov UDBE_CE, %g1 + + .globl __spitfire_data_access_exception + .globl __spitfire_data_access_exception_tl1 +__spitfire_data_access_exception_tl1: rdpr %pstate, %g4 wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate mov TLB_SFSR, %g3 @@ -714,12 +865,12 @@ __do_data_access_exception_tl1: 109: or %g7, %lo(109b), %g7 mov %l4, %o1 mov %l5, %o2 - call data_access_exception_tl1 + call spitfire_data_access_exception_tl1 add %sp, PTREGS_OFF, %o0 ba,pt %xcc, rtrap clr %l6 -__do_data_access_exception: +__spitfire_data_access_exception: rdpr %pstate, %g4 wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate mov TLB_SFSR, %g3 @@ -733,14 +884,14 @@ __do_data_access_exception: 109: or %g7, %lo(109b), %g7 mov %l4, %o1 mov %l5, %o2 - call data_access_exception + call spitfire_data_access_exception add %sp, PTREGS_OFF, %o0 ba,pt %xcc, rtrap clr %l6 - .globl __do_instruction_access_exception - .globl __do_instruction_access_exception_tl1 -__do_instruction_access_exception_tl1: + .globl __spitfire_insn_access_exception + .globl __spitfire_insn_access_exception_tl1 +__spitfire_insn_access_exception_tl1: rdpr %pstate, %g4 wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate mov TLB_SFSR, %g3 @@ -753,12 +904,12 @@ __do_instruction_access_exception_tl1: 109: or %g7, %lo(109b), %g7 mov %l4, %o1 mov %l5, %o2 - call instruction_access_exception_tl1 + call spitfire_insn_access_exception_tl1 add %sp, PTREGS_OFF, %o0 ba,pt %xcc, rtrap clr %l6 -__do_instruction_access_exception: +__spitfire_insn_access_exception: rdpr %pstate, %g4 wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate mov TLB_SFSR, %g3 @@ -771,102 +922,11 @@ __do_instruction_access_exception: 109: or %g7, %lo(109b), %g7 mov %l4, %o1 mov %l5, %o2 - call instruction_access_exception + call spitfire_insn_access_exception add %sp, PTREGS_OFF, %o0 ba,pt %xcc, rtrap clr %l6 - /* This is the trap handler entry point for ECC correctable - * errors. They are corrected, but we listen for the trap - * so that the event can be logged. - * - * Disrupting errors are either: - * 1) single-bit ECC errors during UDB reads to system - * memory - * 2) data parity errors during write-back events - * - * As far as I can make out from the manual, the CEE trap - * is only for correctable errors during memory read - * accesses by the front-end of the processor. - * - * The code below is only for trap level 1 CEE events, - * as it is the only situation where we can safely record - * and log. For trap level >1 we just clear the CE bit - * in the AFSR and return. - */ - - /* Our trap handling infrastructure allows us to preserve - * two 64-bit values during etrap for arguments to - * subsequent C code. Therefore we encode the information - * as follows: - * - * value 1) Full 64-bits of AFAR - * value 2) Low 33-bits of AFSR, then bits 33-->42 - * are UDBL error status and bits 43-->52 - * are UDBH error status - */ - .align 64 - .globl cee_trap -cee_trap: - ldxa [%g0] ASI_AFSR, %g1 ! Read AFSR - ldxa [%g0] ASI_AFAR, %g2 ! Read AFAR - sllx %g1, 31, %g1 ! Clear reserved bits - srlx %g1, 31, %g1 ! in AFSR - - /* NOTE: UltraSparc-I/II have high and low UDB error - * registers, corresponding to the two UDB units - * present on those chips. UltraSparc-IIi only - * has a single UDB, called "SDB" in the manual. - * For IIi the upper UDB register always reads - * as zero so for our purposes things will just - * work with the checks below. - */ - ldxa [%g0] ASI_UDBL_ERROR_R, %g3 ! Read UDB-Low error status - andcc %g3, (1 << 8), %g4 ! Check CE bit - sllx %g3, (64 - 10), %g3 ! Clear reserved bits - srlx %g3, (64 - 10), %g3 ! in UDB-Low error status - - sllx %g3, (33 + 0), %g3 ! Shift up to encoding area - or %g1, %g3, %g1 ! Or it in - be,pn %xcc, 1f ! Branch if CE bit was clear - nop - stxa %g4, [%g0] ASI_UDB_ERROR_W ! Clear CE sticky bit in UDBL - membar #Sync ! Synchronize ASI stores -1: mov 0x18, %g5 ! Addr of UDB-High error status - ldxa [%g5] ASI_UDBH_ERROR_R, %g3 ! Read it - - andcc %g3, (1 << 8), %g4 ! Check CE bit - sllx %g3, (64 - 10), %g3 ! Clear reserved bits - srlx %g3, (64 - 10), %g3 ! in UDB-High error status - sllx %g3, (33 + 10), %g3 ! Shift up to encoding area - or %g1, %g3, %g1 ! Or it in - be,pn %xcc, 1f ! Branch if CE bit was clear - nop - nop - - stxa %g4, [%g5] ASI_UDB_ERROR_W ! Clear CE sticky bit in UDBH - membar #Sync ! Synchronize ASI stores -1: mov 1, %g5 ! AFSR CE bit is - sllx %g5, 20, %g5 ! bit 20 - stxa %g5, [%g0] ASI_AFSR ! Clear CE sticky bit in AFSR - membar #Sync ! Synchronize ASI stores - sllx %g2, (64 - 41), %g2 ! Clear reserved bits - srlx %g2, (64 - 41), %g2 ! in latched AFAR - - andn %g2, 0x0f, %g2 ! Finish resv bit clearing - mov %g1, %g4 ! Move AFSR+UDB* into save reg - mov %g2, %g5 ! Move AFAR into save reg - rdpr %pil, %g2 - wrpr %g0, 15, %pil - ba,pt %xcc, etrap_irq - rd %pc, %g7 - mov %l4, %o0 - - mov %l5, %o1 - call cee_log - add %sp, PTREGS_OFF, %o2 - ba,a,pt %xcc, rtrap_irq - /* Capture I/D/E-cache state into per-cpu error scoreboard. * * %g1: (TL>=0) ? 1 : 0 diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c index 210b3e3..b280b2e 100644 --- a/arch/sparc64/kernel/traps.c +++ b/arch/sparc64/kernel/traps.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -143,8 +144,7 @@ void do_BUG(const char *file, int line) } #endif -void instruction_access_exception(struct pt_regs *regs, - unsigned long sfsr, unsigned long sfar) +void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar) { siginfo_t info; @@ -153,8 +153,8 @@ void instruction_access_exception(struct pt_regs *regs, return; if (regs->tstate & TSTATE_PRIV) { - printk("instruction_access_exception: SFSR[%016lx] SFAR[%016lx], going.\n", - sfsr, sfar); + printk("spitfire_insn_access_exception: SFSR[%016lx] " + "SFAR[%016lx], going.\n", sfsr, sfar); die_if_kernel("Iax", regs); } if (test_thread_flag(TIF_32BIT)) { @@ -169,19 +169,17 @@ void instruction_access_exception(struct pt_regs *regs, force_sig_info(SIGSEGV, &info, current); } -void instruction_access_exception_tl1(struct pt_regs *regs, - unsigned long sfsr, unsigned long sfar) +void spitfire_insn_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar) { if (notify_die(DIE_TRAP_TL1, "instruction access exception tl1", regs, 0, 0x8, SIGTRAP) == NOTIFY_STOP) return; dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); - instruction_access_exception(regs, sfsr, sfar); + spitfire_insn_access_exception(regs, sfsr, sfar); } -void data_access_exception(struct pt_regs *regs, - unsigned long sfsr, unsigned long sfar) +void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar) { siginfo_t info; @@ -207,8 +205,8 @@ void data_access_exception(struct pt_regs *regs, return; } /* Shit... */ - printk("data_access_exception: SFSR[%016lx] SFAR[%016lx], going.\n", - sfsr, sfar); + printk("spitfire_data_access_exception: SFSR[%016lx] " + "SFAR[%016lx], going.\n", sfsr, sfar); die_if_kernel("Dax", regs); } @@ -220,15 +218,14 @@ void data_access_exception(struct pt_regs *regs, force_sig_info(SIGSEGV, &info, current); } -void data_access_exception_tl1(struct pt_regs *regs, - unsigned long sfsr, unsigned long sfar) +void spitfire_data_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar) { if (notify_die(DIE_TRAP_TL1, "data access exception tl1", regs, 0, 0x30, SIGTRAP) == NOTIFY_STOP) return; dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); - data_access_exception(regs, sfsr, sfar); + spitfire_data_access_exception(regs, sfsr, sfar); } #ifdef CONFIG_PCI @@ -264,54 +261,13 @@ static void spitfire_clean_and_reenable_l1_caches(void) : "memory"); } -void do_iae(struct pt_regs *regs) +static void spitfire_enable_estate_errors(void) { - siginfo_t info; - - spitfire_clean_and_reenable_l1_caches(); - - if (notify_die(DIE_TRAP, "instruction access exception", regs, - 0, 0x8, SIGTRAP) == NOTIFY_STOP) - return; - - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_OBJERR; - info.si_addr = (void *)0; - info.si_trapno = 0; - force_sig_info(SIGBUS, &info, current); -} - -void do_dae(struct pt_regs *regs) -{ - siginfo_t info; - -#ifdef CONFIG_PCI - if (pci_poke_in_progress && pci_poke_cpu == smp_processor_id()) { - spitfire_clean_and_reenable_l1_caches(); - - pci_poke_faulted = 1; - - /* Why the fuck did they have to change this? */ - if (tlb_type == cheetah || tlb_type == cheetah_plus) - regs->tpc += 4; - - regs->tnpc = regs->tpc + 4; - return; - } -#endif - spitfire_clean_and_reenable_l1_caches(); - - if (notify_die(DIE_TRAP, "data access exception", regs, - 0, 0x30, SIGTRAP) == NOTIFY_STOP) - return; - - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_OBJERR; - info.si_addr = (void *)0; - info.si_trapno = 0; - force_sig_info(SIGBUS, &info, current); + __asm__ __volatile__("stxa %0, [%%g0] %1\n\t" + "membar #Sync" + : /* no outputs */ + : "r" (ESTATE_ERR_ALL), + "i" (ASI_ESTATE_ERROR_EN)); } static char ecc_syndrome_table[] = { @@ -349,65 +305,15 @@ static char ecc_syndrome_table[] = { 0x0b, 0x48, 0x48, 0x4b, 0x48, 0x4b, 0x4b, 0x4a }; -/* cee_trap in entry.S encodes AFSR/UDBH/UDBL error status - * in the following format. The AFAR is left as is, with - * reserved bits cleared, and is a raw 40-bit physical - * address. - */ -#define CE_STATUS_UDBH_UE (1UL << (43 + 9)) -#define CE_STATUS_UDBH_CE (1UL << (43 + 8)) -#define CE_STATUS_UDBH_ESYNDR (0xffUL << 43) -#define CE_STATUS_UDBH_SHIFT 43 -#define CE_STATUS_UDBL_UE (1UL << (33 + 9)) -#define CE_STATUS_UDBL_CE (1UL << (33 + 8)) -#define CE_STATUS_UDBL_ESYNDR (0xffUL << 33) -#define CE_STATUS_UDBL_SHIFT 33 -#define CE_STATUS_AFSR_MASK (0x1ffffffffUL) -#define CE_STATUS_AFSR_ME (1UL << 32) -#define CE_STATUS_AFSR_PRIV (1UL << 31) -#define CE_STATUS_AFSR_ISAP (1UL << 30) -#define CE_STATUS_AFSR_ETP (1UL << 29) -#define CE_STATUS_AFSR_IVUE (1UL << 28) -#define CE_STATUS_AFSR_TO (1UL << 27) -#define CE_STATUS_AFSR_BERR (1UL << 26) -#define CE_STATUS_AFSR_LDP (1UL << 25) -#define CE_STATUS_AFSR_CP (1UL << 24) -#define CE_STATUS_AFSR_WP (1UL << 23) -#define CE_STATUS_AFSR_EDP (1UL << 22) -#define CE_STATUS_AFSR_UE (1UL << 21) -#define CE_STATUS_AFSR_CE (1UL << 20) -#define CE_STATUS_AFSR_ETS (0xfUL << 16) -#define CE_STATUS_AFSR_ETS_SHIFT 16 -#define CE_STATUS_AFSR_PSYND (0xffffUL << 0) -#define CE_STATUS_AFSR_PSYND_SHIFT 0 - -/* Layout of Ecache TAG Parity Syndrome of AFSR */ -#define AFSR_ETSYNDROME_7_0 0x1UL /* E$-tag bus bits <7:0> */ -#define AFSR_ETSYNDROME_15_8 0x2UL /* E$-tag bus bits <15:8> */ -#define AFSR_ETSYNDROME_21_16 0x4UL /* E$-tag bus bits <21:16> */ -#define AFSR_ETSYNDROME_24_22 0x8UL /* E$-tag bus bits <24:22> */ - static char *syndrome_unknown = ""; -asmlinkage void cee_log(unsigned long ce_status, - unsigned long afar, - struct pt_regs *regs) +static void spitfire_log_udb_syndrome(unsigned long afar, unsigned long udbh, unsigned long udbl, unsigned long bit) { - char memmod_str[64]; - char *p; - unsigned short scode, udb_reg; + unsigned short scode; + char memmod_str[64], *p; - printk(KERN_WARNING "CPU[%d]: Correctable ECC Error " - "AFSR[%lx] AFAR[%016lx] UDBL[%lx] UDBH[%lx]\n", - smp_processor_id(), - (ce_status & CE_STATUS_AFSR_MASK), - afar, - ((ce_status >> CE_STATUS_UDBL_SHIFT) & 0x3ffUL), - ((ce_status >> CE_STATUS_UDBH_SHIFT) & 0x3ffUL)); - - udb_reg = ((ce_status >> CE_STATUS_UDBL_SHIFT) & 0x3ffUL); - if (udb_reg & (1 << 8)) { - scode = ecc_syndrome_table[udb_reg & 0xff]; + if (udbl & bit) { + scode = ecc_syndrome_table[udbl & 0xff]; if (prom_getunumber(scode, afar, memmod_str, sizeof(memmod_str)) == -1) p = syndrome_unknown; @@ -418,9 +324,8 @@ asmlinkage void cee_log(unsigned long ce_status, smp_processor_id(), scode, p); } - udb_reg = ((ce_status >> CE_STATUS_UDBH_SHIFT) & 0x3ffUL); - if (udb_reg & (1 << 8)) { - scode = ecc_syndrome_table[udb_reg & 0xff]; + if (udbh & bit) { + scode = ecc_syndrome_table[udbh & 0xff]; if (prom_getunumber(scode, afar, memmod_str, sizeof(memmod_str)) == -1) p = syndrome_unknown; @@ -430,6 +335,127 @@ asmlinkage void cee_log(unsigned long ce_status, "Memory Module \"%s\"\n", smp_processor_id(), scode, p); } + +} + +static void spitfire_cee_log(unsigned long afsr, unsigned long afar, unsigned long udbh, unsigned long udbl, int tl1, struct pt_regs *regs) +{ + + printk(KERN_WARNING "CPU[%d]: Correctable ECC Error " + "AFSR[%lx] AFAR[%016lx] UDBL[%lx] UDBH[%lx] TL>1[%d]\n", + smp_processor_id(), afsr, afar, udbl, udbh, tl1); + + spitfire_log_udb_syndrome(afar, udbh, udbl, UDBE_CE); + + /* We always log it, even if someone is listening for this + * trap. + */ + notify_die(DIE_TRAP, "Correctable ECC Error", regs, + 0, TRAP_TYPE_CEE, SIGTRAP); + + /* The Correctable ECC Error trap does not disable I/D caches. So + * we only have to restore the ESTATE Error Enable register. + */ + spitfire_enable_estate_errors(); +} + +static void spitfire_ue_log(unsigned long afsr, unsigned long afar, unsigned long udbh, unsigned long udbl, unsigned long tt, int tl1, struct pt_regs *regs) +{ + siginfo_t info; + + printk(KERN_WARNING "CPU[%d]: Uncorrectable Error AFSR[%lx] " + "AFAR[%lx] UDBL[%lx] UDBH[%ld] TT[%lx] TL>1[%d]\n", + smp_processor_id(), afsr, afar, udbl, udbh, tt, tl1); + + /* XXX add more human friendly logging of the error status + * XXX as is implemented for cheetah + */ + + spitfire_log_udb_syndrome(afar, udbh, udbl, UDBE_UE); + + /* We always log it, even if someone is listening for this + * trap. + */ + notify_die(DIE_TRAP, "Uncorrectable Error", regs, + 0, tt, SIGTRAP); + + if (regs->tstate & TSTATE_PRIV) { + if (tl1) + dump_tl1_traplog((struct tl1_traplog *)(regs + 1)); + die_if_kernel("UE", regs); + } + + /* XXX need more intelligent processing here, such as is implemented + * XXX for cheetah errors, in fact if the E-cache still holds the + * XXX line with bad parity this will loop + */ + + spitfire_clean_and_reenable_l1_caches(); + spitfire_enable_estate_errors(); + + if (test_thread_flag(TIF_32BIT)) { + regs->tpc &= 0xffffffff; + regs->tnpc &= 0xffffffff; + } + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_OBJERR; + info.si_addr = (void *)0; + info.si_trapno = 0; + force_sig_info(SIGBUS, &info, current); +} + +void spitfire_access_error(struct pt_regs *regs, unsigned long status_encoded, unsigned long afar) +{ + unsigned long afsr, tt, udbh, udbl; + int tl1; + + afsr = (status_encoded & SFSTAT_AFSR_MASK) >> SFSTAT_AFSR_SHIFT; + tt = (status_encoded & SFSTAT_TRAP_TYPE) >> SFSTAT_TRAP_TYPE_SHIFT; + tl1 = (status_encoded & SFSTAT_TL_GT_ONE) ? 1 : 0; + udbl = (status_encoded & SFSTAT_UDBL_MASK) >> SFSTAT_UDBL_SHIFT; + udbh = (status_encoded & SFSTAT_UDBH_MASK) >> SFSTAT_UDBH_SHIFT; + +#ifdef CONFIG_PCI + if (tt == TRAP_TYPE_DAE && + pci_poke_in_progress && pci_poke_cpu == smp_processor_id()) { + spitfire_clean_and_reenable_l1_caches(); + spitfire_enable_estate_errors(); + + pci_poke_faulted = 1; + regs->tnpc = regs->tpc + 4; + return; + } +#endif + + if (afsr & SFAFSR_UE) + spitfire_ue_log(afsr, afar, udbh, udbl, tt, tl1, regs); + + if (tt == TRAP_TYPE_CEE) { + /* Handle the case where we took a CEE trap, but ACK'd + * only the UE state in the UDB error registers. + */ + if (afsr & SFAFSR_UE) { + if (udbh & UDBE_CE) { + __asm__ __volatile__( + "stxa %0, [%1] %2\n\t" + "membar #Sync" + : /* no outputs */ + : "r" (udbh & UDBE_CE), + "r" (0x0), "i" (ASI_UDB_ERROR_W)); + } + if (udbl & UDBE_CE) { + __asm__ __volatile__( + "stxa %0, [%1] %2\n\t" + "membar #Sync" + : /* no outputs */ + : "r" (udbl & UDBE_CE), + "r" (0x18), "i" (ASI_UDB_ERROR_W)); + } + } + + spitfire_cee_log(afsr, afar, udbh, udbl, tl1, regs); + } } int cheetah_pcache_forced_on; diff --git a/arch/sparc64/kernel/ttable.S b/arch/sparc64/kernel/ttable.S index 491bb36..8365bc1 100644 --- a/arch/sparc64/kernel/ttable.S +++ b/arch/sparc64/kernel/ttable.S @@ -18,9 +18,10 @@ sparc64_ttable_tl0: tl0_resv000: BOOT_KERNEL BTRAP(0x1) BTRAP(0x2) BTRAP(0x3) tl0_resv004: BTRAP(0x4) BTRAP(0x5) BTRAP(0x6) BTRAP(0x7) tl0_iax: membar #Sync - TRAP_NOSAVE_7INSNS(__do_instruction_access_exception) + TRAP_NOSAVE_7INSNS(__spitfire_insn_access_exception) tl0_resv009: BTRAP(0x9) -tl0_iae: TRAP(do_iae) +tl0_iae: membar #Sync + TRAP_NOSAVE_7INSNS(__spitfire_access_error) tl0_resv00b: BTRAP(0xb) BTRAP(0xc) BTRAP(0xd) BTRAP(0xe) BTRAP(0xf) tl0_ill: membar #Sync TRAP_7INSNS(do_illegal_instruction) @@ -36,9 +37,10 @@ tl0_cwin: CLEAN_WINDOW tl0_div0: TRAP(do_div0) tl0_resv029: BTRAP(0x29) BTRAP(0x2a) BTRAP(0x2b) BTRAP(0x2c) BTRAP(0x2d) BTRAP(0x2e) tl0_resv02f: BTRAP(0x2f) -tl0_dax: TRAP_NOSAVE(__do_data_access_exception) +tl0_dax: TRAP_NOSAVE(__spitfire_data_access_exception) tl0_resv031: BTRAP(0x31) -tl0_dae: TRAP(do_dae) +tl0_dae: membar #Sync + TRAP_NOSAVE_7INSNS(__spitfire_access_error) tl0_resv033: BTRAP(0x33) tl0_mna: TRAP_NOSAVE(do_mna) tl0_lddfmna: TRAP_NOSAVE(do_lddfmna) @@ -73,7 +75,8 @@ tl0_resv05c: BTRAP(0x5c) BTRAP(0x5d) BTRAP(0x5e) BTRAP(0x5f) tl0_ivec: TRAP_IVEC tl0_paw: TRAP(do_paw) tl0_vaw: TRAP(do_vaw) -tl0_cee: TRAP_NOSAVE(cee_trap) +tl0_cee: membar #Sync + TRAP_NOSAVE_7INSNS(__spitfire_cee_trap) tl0_iamiss: #include "itlb_base.S" tl0_damiss: @@ -175,9 +178,10 @@ tl0_resv1f0: BTRAPS(0x1f0) BTRAPS(0x1f8) sparc64_ttable_tl1: tl1_resv000: BOOT_KERNEL BTRAPTL1(0x1) BTRAPTL1(0x2) BTRAPTL1(0x3) tl1_resv004: BTRAPTL1(0x4) BTRAPTL1(0x5) BTRAPTL1(0x6) BTRAPTL1(0x7) -tl1_iax: TRAP_NOSAVE(__do_instruction_access_exception_tl1) +tl1_iax: TRAP_NOSAVE(__spitfire_insn_access_exception_tl1) tl1_resv009: BTRAPTL1(0x9) -tl1_iae: TRAPTL1(do_iae_tl1) +tl1_iae: membar #Sync + TRAP_NOSAVE_7INSNS(__spitfire_access_error) tl1_resv00b: BTRAPTL1(0xb) BTRAPTL1(0xc) BTRAPTL1(0xd) BTRAPTL1(0xe) BTRAPTL1(0xf) tl1_ill: TRAPTL1(do_ill_tl1) tl1_privop: BTRAPTL1(0x11) @@ -193,9 +197,10 @@ tl1_cwin: CLEAN_WINDOW tl1_div0: TRAPTL1(do_div0_tl1) tl1_resv029: BTRAPTL1(0x29) BTRAPTL1(0x2a) BTRAPTL1(0x2b) BTRAPTL1(0x2c) tl1_resv02d: BTRAPTL1(0x2d) BTRAPTL1(0x2e) BTRAPTL1(0x2f) -tl1_dax: TRAP_NOSAVE(__do_data_access_exception_tl1) +tl1_dax: TRAP_NOSAVE(__spitfire_data_access_exception_tl1) tl1_resv031: BTRAPTL1(0x31) -tl1_dae: TRAPTL1(do_dae_tl1) +tl1_dae: membar #Sync + TRAP_NOSAVE_7INSNS(__spitfire_access_error) tl1_resv033: BTRAPTL1(0x33) tl1_mna: TRAP_NOSAVE(do_mna) tl1_lddfmna: TRAPTL1(do_lddfmna_tl1) @@ -219,8 +224,8 @@ tl1_paw: TRAPTL1(do_paw_tl1) tl1_vaw: TRAPTL1(do_vaw_tl1) /* The grotty trick to save %g1 into current->thread.cee_stuff - * is because when we take this trap we could be interrupting trap - * code already using the trap alternate global registers. + * is because when we take this trap we could be interrupting + * trap code already using the trap alternate global registers. * * We cross our fingers and pray that this store/load does * not cause yet another CEE trap. diff --git a/arch/sparc64/kernel/unaligned.c b/arch/sparc64/kernel/unaligned.c index 11c3e88..da9739f 100644 --- a/arch/sparc64/kernel/unaligned.c +++ b/arch/sparc64/kernel/unaligned.c @@ -349,9 +349,9 @@ int handle_popc(u32 insn, struct pt_regs *regs) extern void do_fpother(struct pt_regs *regs); extern void do_privact(struct pt_regs *regs); -extern void data_access_exception(struct pt_regs *regs, - unsigned long sfsr, - unsigned long sfar); +extern void spitfire_data_access_exception(struct pt_regs *regs, + unsigned long sfsr, + unsigned long sfar); int handle_ldf_stq(u32 insn, struct pt_regs *regs) { @@ -394,14 +394,14 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs) break; } default: - data_access_exception(regs, 0, addr); + spitfire_data_access_exception(regs, 0, addr); return 1; } if (put_user (first >> 32, (u32 __user *)addr) || __put_user ((u32)first, (u32 __user *)(addr + 4)) || __put_user (second >> 32, (u32 __user *)(addr + 8)) || __put_user ((u32)second, (u32 __user *)(addr + 12))) { - data_access_exception(regs, 0, addr); + spitfire_data_access_exception(regs, 0, addr); return 1; } } else { @@ -414,7 +414,7 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs) do_privact(regs); return 1; } else if (asi > ASI_SNFL) { - data_access_exception(regs, 0, addr); + spitfire_data_access_exception(regs, 0, addr); return 1; } switch (insn & 0x180000) { @@ -431,7 +431,7 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs) err |= __get_user (data[i], (u32 __user *)(addr + 4*i)); } if (err && !(asi & 0x2 /* NF */)) { - data_access_exception(regs, 0, addr); + spitfire_data_access_exception(regs, 0, addr); return 1; } if (asi & 0x8) /* Little */ { @@ -534,7 +534,7 @@ void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr *(u64 *)(f->regs + freg) = value; current_thread_info()->fpsaved[0] |= flag; } else { -daex: data_access_exception(regs, sfsr, sfar); +daex: spitfire_data_access_exception(regs, sfsr, sfar); return; } advance(regs); @@ -578,7 +578,7 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr __put_user ((u32)value, (u32 __user *)(sfar + 4))) goto daex; } else { -daex: data_access_exception(regs, sfsr, sfar); +daex: spitfire_data_access_exception(regs, sfsr, sfar); return; } advance(regs); diff --git a/arch/sparc64/kernel/winfixup.S b/arch/sparc64/kernel/winfixup.S index dfbc7e0..99c809a 100644 --- a/arch/sparc64/kernel/winfixup.S +++ b/arch/sparc64/kernel/winfixup.S @@ -318,7 +318,7 @@ fill_fixup_dax: nop rdpr %pstate, %l1 ! Prepare to change globals. mov %g4, %o1 ! Setup args for - mov %g5, %o2 ! final call to data_access_exception. + mov %g5, %o2 ! final call to spitfire_data_access_exception. andn %l1, PSTATE_MM, %l1 ! We want to be in RMO mov %g6, %o7 ! Stash away current. @@ -330,7 +330,7 @@ fill_fixup_dax: mov TSB_REG, %g1 ldxa [%g1] ASI_IMMU, %g5 #endif - call data_access_exception + call spitfire_data_access_exception add %sp, PTREGS_OFF, %o0 b,pt %xcc, rtrap @@ -391,7 +391,7 @@ window_dax_from_user_common: 109: or %g7, %lo(109b), %g7 mov %l4, %o1 mov %l5, %o2 - call data_access_exception + call spitfire_data_access_exception add %sp, PTREGS_OFF, %o0 ba,pt %xcc, rtrap clr %l6 diff --git a/include/asm-sparc64/sfafsr.h b/include/asm-sparc64/sfafsr.h new file mode 100644 index 0000000..2f792c2 --- /dev/null +++ b/include/asm-sparc64/sfafsr.h @@ -0,0 +1,82 @@ +#ifndef _SPARC64_SFAFSR_H +#define _SPARC64_SFAFSR_H + +#include + +/* Spitfire Asynchronous Fault Status register, ASI=0x4C VA<63:0>=0x0 */ + +#define SFAFSR_ME (_AC(1,UL) << SFAFSR_ME_SHIFT) +#define SFAFSR_ME_SHIFT 32 +#define SFAFSR_PRIV (_AC(1,UL) << SFAFSR_PRIV_SHIFT) +#define SFAFSR_PRIV_SHIFT 31 +#define SFAFSR_ISAP (_AC(1,UL) << SFAFSR_ISAP_SHIFT) +#define SFAFSR_ISAP_SHIFT 30 +#define SFAFSR_ETP (_AC(1,UL) << SFAFSR_ETP_SHIFT) +#define SFAFSR_ETP_SHIFT 29 +#define SFAFSR_IVUE (_AC(1,UL) << SFAFSR_IVUE_SHIFT) +#define SFAFSR_IVUE_SHIFT 28 +#define SFAFSR_TO (_AC(1,UL) << SFAFSR_TO_SHIFT) +#define SFAFSR_TO_SHIFT 27 +#define SFAFSR_BERR (_AC(1,UL) << SFAFSR_BERR_SHIFT) +#define SFAFSR_BERR_SHIFT 26 +#define SFAFSR_LDP (_AC(1,UL) << SFAFSR_LDP_SHIFT) +#define SFAFSR_LDP_SHIFT 25 +#define SFAFSR_CP (_AC(1,UL) << SFAFSR_CP_SHIFT) +#define SFAFSR_CP_SHIFT 24 +#define SFAFSR_WP (_AC(1,UL) << SFAFSR_WP_SHIFT) +#define SFAFSR_WP_SHIFT 23 +#define SFAFSR_EDP (_AC(1,UL) << SFAFSR_EDP_SHIFT) +#define SFAFSR_EDP_SHIFT 22 +#define SFAFSR_UE (_AC(1,UL) << SFAFSR_UE_SHIFT) +#define SFAFSR_UE_SHIFT 21 +#define SFAFSR_CE (_AC(1,UL) << SFAFSR_CE_SHIFT) +#define SFAFSR_CE_SHIFT 20 +#define SFAFSR_ETS (_AC(0xf,UL) << SFAFSR_ETS_SHIFT) +#define SFAFSR_ETS_SHIFT 16 +#define SFAFSR_PSYND (_AC(0xffff,UL) << SFAFSR_PSYND_SHIFT) +#define SFAFSR_PSYND_SHIFT 0 + +/* UDB Error Register, ASI=0x7f VA<63:0>=0x0(High),0x18(Low) for read + * ASI=0x77 VA<63:0>=0x0(High),0x18(Low) for write + */ + +#define UDBE_UE (_AC(1,UL) << 9) +#define UDBE_CE (_AC(1,UL) << 8) +#define UDBE_E_SYNDR (_AC(0xff,UL) << 0) + +/* The trap handlers for asynchronous errors encode the AFSR and + * other pieces of information into a 64-bit argument for C code + * encoded as follows: + * + * ----------------------------------------------- + * | UDB_H | UDB_L | TL>1 | TT | AFSR | + * ----------------------------------------------- + * 63 54 53 44 42 41 33 32 0 + * + * The AFAR is passed in unchanged. + */ +#define SFSTAT_UDBH_MASK (_AC(0x3ff,UL) << SFSTAT_UDBH_SHIFT) +#define SFSTAT_UDBH_SHIFT 54 +#define SFSTAT_UDBL_MASK (_AC(0x3ff,UL) << SFSTAT_UDBH_SHIFT) +#define SFSTAT_UDBL_SHIFT 44 +#define SFSTAT_TL_GT_ONE (_AC(1,UL) << SFSTAT_TL_GT_ONE_SHIFT) +#define SFSTAT_TL_GT_ONE_SHIFT 42 +#define SFSTAT_TRAP_TYPE (_AC(0x1FF,UL) << SFSTAT_TRAP_TYPE_SHIFT) +#define SFSTAT_TRAP_TYPE_SHIFT 33 +#define SFSTAT_AFSR_MASK (_AC(0x1ffffffff,UL) << SFSTAT_AFSR_SHIFT) +#define SFSTAT_AFSR_SHIFT 0 + +/* ESTATE Error Enable Register, ASI=0x4b VA<63:0>=0x0 */ +#define ESTATE_ERR_CE 0x1 /* Correctable errors */ +#define ESTATE_ERR_NCE 0x2 /* TO, BERR, LDP, ETP, EDP, WP, UE, IVUE */ +#define ESTATE_ERR_ISAP 0x4 /* System address parity error */ +#define ESTATE_ERR_ALL (ESTATE_ERR_CE | \ + ESTATE_ERR_NCE | \ + ESTATE_ERR_ISAP) + +/* The various trap types that report using the above state. */ +#define TRAP_TYPE_IAE 0x09 /* Instruction Access Error */ +#define TRAP_TYPE_DAE 0x32 /* Data Access Error */ +#define TRAP_TYPE_CEE 0x63 /* Correctable ECC Error */ + +#endif /* _SPARC64_SFAFSR_H */ -- cgit v0.10.2 From 3d6364abcfdaedeb34418c2894f61251d48614f6 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Mon, 29 Aug 2005 12:45:30 -0700 Subject: [SPARC64]: remove use of asm/segment.h Removed sparc64 architecture specific users of asm/segment.h and asm-sparc64/segment.h itself Signed-off-by: Kumar Gala Signed-off-by: David S. Miller diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c index b7e6a91..fbdfed3 100644 --- a/arch/sparc64/kernel/setup.c +++ b/arch/sparc64/kernel/setup.c @@ -33,7 +33,6 @@ #include #include -#include #include #include #include diff --git a/include/asm-sparc64/processor.h b/include/asm-sparc64/processor.h index d0bee24..3169f3e 100644 --- a/include/asm-sparc64/processor.h +++ b/include/asm-sparc64/processor.h @@ -18,7 +18,6 @@ #include #include #include -#include #include /* The sparc has no problems with write protection */ diff --git a/include/asm-sparc64/segment.h b/include/asm-sparc64/segment.h deleted file mode 100644 index b03e709..0000000 --- a/include/asm-sparc64/segment.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __SPARC64_SEGMENT_H -#define __SPARC64_SEGMENT_H - -/* Only here because we have some old header files that expect it.. */ - -#endif -- cgit v0.10.2 From ca7c8d2c1e2a2f2445cb5e00f45b93af57f22c1b Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Mon, 29 Aug 2005 12:45:44 -0700 Subject: [SPARC]: remove use of asm/segment.h Removed sparc architecture specific users of asm/segment.h and asm-sparc/segment.h itself Signed-off-by: Kumar Gala Signed-off-by: David S. Miller diff --git a/arch/sparc/kernel/setup.c b/arch/sparc/kernel/setup.c index 55352ed..53c192a 100644 --- a/arch/sparc/kernel/setup.c +++ b/arch/sparc/kernel/setup.c @@ -32,7 +32,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/tick14.c b/arch/sparc/kernel/tick14.c index fd8005a..591547a 100644 --- a/arch/sparc/kernel/tick14.c +++ b/arch/sparc/kernel/tick14.c @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c index 6486cbf..3b759ae 100644 --- a/arch/sparc/kernel/time.c +++ b/arch/sparc/kernel/time.c @@ -32,7 +32,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/mm/fault.c b/arch/sparc/mm/fault.c index 37f4107..2bbd53f 100644 --- a/arch/sparc/mm/fault.c +++ b/arch/sparc/mm/fault.c @@ -23,7 +23,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c index ec2e050..c03baba 100644 --- a/arch/sparc/mm/init.c +++ b/arch/sparc/mm/init.c @@ -25,7 +25,6 @@ #include #include -#include #include #include #include diff --git a/include/asm-sparc/processor.h b/include/asm-sparc/processor.h index 32c9699..5a7a1a8 100644 --- a/include/asm-sparc/processor.h +++ b/include/asm-sparc/processor.h @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/include/asm-sparc/segment.h b/include/asm-sparc/segment.h deleted file mode 100644 index a1b7ffc..0000000 --- a/include/asm-sparc/segment.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __SPARC_SEGMENT_H -#define __SPARC_SEGMENT_H - -/* Only here because we have some old header files that expect it.. */ - -#endif diff --git a/include/asm-sparc/system.h b/include/asm-sparc/system.h index 898562e..3557781 100644 --- a/include/asm-sparc/system.h +++ b/include/asm-sparc/system.h @@ -9,7 +9,6 @@ #include /* NR_CPUS */ #include -#include #include #include #include -- cgit v0.10.2 From 442464a50077ff00454ff8d7628cbe1b8eacc034 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 29 Aug 2005 12:46:07 -0700 Subject: [SPARC64]: Make debugging spinlocks usable again. When the spinlock routines were moved out of line into kernel/spinlock.c this made it so that the debugging spinlocks record lock acquisition program counts in the kernel/spinlock.c functions not in their callers. This makes the debugging info kind of useless. So record the correct caller's program counter and now this feature is useful once more. Signed-off-by: David S. Miller diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 9202d92..0764b93 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -99,17 +99,6 @@ extern int __ashrdi3(int, int); extern void dump_thread(struct pt_regs *, struct user *); extern int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs); -#if defined(CONFIG_SMP) && defined(CONFIG_DEBUG_SPINLOCK) -extern void _do_spin_lock (spinlock_t *lock, char *str); -extern void _do_spin_unlock (spinlock_t *lock); -extern int _spin_trylock (spinlock_t *lock); -extern void _do_read_lock(rwlock_t *rw, char *str); -extern void _do_read_unlock(rwlock_t *rw, char *str); -extern void _do_write_lock(rwlock_t *rw, char *str); -extern void _do_write_unlock(rwlock_t *rw); -extern int _do_write_trylock(rwlock_t *rw, char *str); -#endif - extern unsigned long phys_base; extern unsigned long pfn_base; @@ -152,18 +141,6 @@ EXPORT_SYMBOL(_mcount); EXPORT_SYMBOL(cpu_online_map); EXPORT_SYMBOL(phys_cpu_present_map); -/* Spinlock debugging library, optional. */ -#ifdef CONFIG_DEBUG_SPINLOCK -EXPORT_SYMBOL(_do_spin_lock); -EXPORT_SYMBOL(_do_spin_unlock); -EXPORT_SYMBOL(_spin_trylock); -EXPORT_SYMBOL(_do_read_lock); -EXPORT_SYMBOL(_do_read_unlock); -EXPORT_SYMBOL(_do_write_lock); -EXPORT_SYMBOL(_do_write_unlock); -EXPORT_SYMBOL(_do_write_trylock); -#endif - EXPORT_SYMBOL(smp_call_function); #endif /* CONFIG_SMP */ diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c index f03344c..7f6ccc4 100644 --- a/arch/sparc64/lib/debuglocks.c +++ b/arch/sparc64/lib/debuglocks.c @@ -12,8 +12,6 @@ #ifdef CONFIG_SMP -#define GET_CALLER(PC) __asm__ __volatile__("mov %%i7, %0" : "=r" (PC)) - static inline void show (char *str, spinlock_t *lock, unsigned long caller) { int cpu = smp_processor_id(); @@ -51,14 +49,13 @@ static inline void show_write (char *str, rwlock_t *lock, unsigned long caller) #undef INIT_STUCK #define INIT_STUCK 100000000 -void _do_spin_lock(spinlock_t *lock, char *str) +void _do_spin_lock(spinlock_t *lock, char *str, unsigned long caller) { - unsigned long caller, val; + unsigned long val; int stuck = INIT_STUCK; int cpu = get_cpu(); int shown = 0; - GET_CALLER(caller); again: __asm__ __volatile__("ldstub [%1], %0" : "=r" (val) @@ -84,12 +81,11 @@ again: put_cpu(); } -int _do_spin_trylock(spinlock_t *lock) +int _do_spin_trylock(spinlock_t *lock, unsigned long caller) { - unsigned long val, caller; + unsigned long val; int cpu = get_cpu(); - GET_CALLER(caller); __asm__ __volatile__("ldstub [%1], %0" : "=r" (val) : "r" (&(lock->lock)) @@ -118,14 +114,13 @@ void _do_spin_unlock(spinlock_t *lock) /* Keep INIT_STUCK the same... */ -void _do_read_lock (rwlock_t *rw, char *str) +void _do_read_lock(rwlock_t *rw, char *str, unsigned long caller) { - unsigned long caller, val; + unsigned long val; int stuck = INIT_STUCK; int cpu = get_cpu(); int shown = 0; - GET_CALLER(caller); wlock_again: /* Wait for any writer to go away. */ while (((long)(rw->lock)) < 0) { @@ -157,15 +152,13 @@ wlock_again: put_cpu(); } -void _do_read_unlock (rwlock_t *rw, char *str) +void _do_read_unlock(rwlock_t *rw, char *str, unsigned long caller) { - unsigned long caller, val; + unsigned long val; int stuck = INIT_STUCK; int cpu = get_cpu(); int shown = 0; - GET_CALLER(caller); - /* Drop our identity _first_. */ rw->reader_pc[cpu] = 0; current->thread.smp_lock_count--; @@ -193,14 +186,13 @@ runlock_again: put_cpu(); } -void _do_write_lock (rwlock_t *rw, char *str) +void _do_write_lock(rwlock_t *rw, char *str, unsigned long caller) { - unsigned long caller, val; + unsigned long val; int stuck = INIT_STUCK; int cpu = get_cpu(); int shown = 0; - GET_CALLER(caller); wlock_again: /* Spin while there is another writer. */ while (((long)rw->lock) < 0) { @@ -278,14 +270,12 @@ wlock_again: put_cpu(); } -void _do_write_unlock(rwlock_t *rw) +void _do_write_unlock(rwlock_t *rw, unsigned long caller) { - unsigned long caller, val; + unsigned long val; int stuck = INIT_STUCK; int shown = 0; - GET_CALLER(caller); - /* Drop our identity _first_ */ rw->writer_pc = 0; rw->writer_cpu = NO_PROC_ID; @@ -313,13 +303,11 @@ wlock_again: } } -int _do_write_trylock (rwlock_t *rw, char *str) +int _do_write_trylock(rwlock_t *rw, char *str, unsigned long caller) { - unsigned long caller, val; + unsigned long val; int cpu = get_cpu(); - GET_CALLER(caller); - /* Try to acuire the write bit. */ __asm__ __volatile__( " mov 1, %%g3\n" diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h index 9cb93a5..d265bf6 100644 --- a/include/asm-sparc64/spinlock.h +++ b/include/asm-sparc64/spinlock.h @@ -132,12 +132,15 @@ do { \ membar("#LoadLoad"); \ } while((__lock)->lock) -extern void _do_spin_lock (spinlock_t *lock, char *str); -extern void _do_spin_unlock (spinlock_t *lock); -extern int _do_spin_trylock (spinlock_t *lock); - -#define _raw_spin_trylock(lp) _do_spin_trylock(lp) -#define _raw_spin_lock(lock) _do_spin_lock(lock, "spin_lock") +extern void _do_spin_lock(spinlock_t *lock, char *str, unsigned long caller); +extern void _do_spin_unlock(spinlock_t *lock); +extern int _do_spin_trylock(spinlock_t *lock, unsigned long caller); + +#define _raw_spin_trylock(lp) \ + _do_spin_trylock(lp, (unsigned long) __builtin_return_address(0)) +#define _raw_spin_lock(lock) \ + _do_spin_lock(lock, "spin_lock", \ + (unsigned long) __builtin_return_address(0)) #define _raw_spin_unlock(lock) _do_spin_unlock(lock) #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) @@ -279,37 +282,41 @@ typedef struct { #define RW_LOCK_UNLOCKED (rwlock_t) { 0, 0, 0xff, { } } #define rwlock_init(lp) do { *(lp) = RW_LOCK_UNLOCKED; } while(0) -extern void _do_read_lock(rwlock_t *rw, char *str); -extern void _do_read_unlock(rwlock_t *rw, char *str); -extern void _do_write_lock(rwlock_t *rw, char *str); -extern void _do_write_unlock(rwlock_t *rw); -extern int _do_write_trylock(rwlock_t *rw, char *str); +extern void _do_read_lock(rwlock_t *rw, char *str, unsigned long caller); +extern void _do_read_unlock(rwlock_t *rw, char *str, unsigned long caller); +extern void _do_write_lock(rwlock_t *rw, char *str, unsigned long caller); +extern void _do_write_unlock(rwlock_t *rw, unsigned long caller); +extern int _do_write_trylock(rwlock_t *rw, char *str, unsigned long caller); #define _raw_read_lock(lock) \ do { unsigned long flags; \ local_irq_save(flags); \ - _do_read_lock(lock, "read_lock"); \ + _do_read_lock(lock, "read_lock", \ + (unsigned long) __builtin_return_address(0)); \ local_irq_restore(flags); \ } while(0) #define _raw_read_unlock(lock) \ do { unsigned long flags; \ local_irq_save(flags); \ - _do_read_unlock(lock, "read_unlock"); \ + _do_read_unlock(lock, "read_unlock", \ + (unsigned long) __builtin_return_address(0)); \ local_irq_restore(flags); \ } while(0) #define _raw_write_lock(lock) \ do { unsigned long flags; \ local_irq_save(flags); \ - _do_write_lock(lock, "write_lock"); \ + _do_write_lock(lock, "write_lock", \ + (unsigned long) __builtin_return_address(0)); \ local_irq_restore(flags); \ } while(0) #define _raw_write_unlock(lock) \ do { unsigned long flags; \ local_irq_save(flags); \ - _do_write_unlock(lock); \ + _do_write_unlock(lock, \ + (unsigned long) __builtin_return_address(0)); \ local_irq_restore(flags); \ } while(0) @@ -317,7 +324,8 @@ do { unsigned long flags; \ ({ unsigned long flags; \ int val; \ local_irq_save(flags); \ - val = _do_write_trylock(lock, "write_trylock"); \ + val = _do_write_trylock(lock, "write_trylock", \ + (unsigned long) __builtin_return_address(0)); \ local_irq_restore(flags); \ val; \ }) -- cgit v0.10.2 From 4f07118f656c179740cad35b827032e2e29b1210 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 29 Aug 2005 12:46:22 -0700 Subject: [SPARC64]: More fully work around Spitfire Errata 51. It appears that a memory barrier soon after a mispredicted branch, not just in the delay slot, can cause the hang condition of this cpu errata. So move them out-of-line, and explicitly put them into a "branch always, predict taken" delay slot which should fully kill this problem. Signed-off-by: David S. Miller diff --git a/arch/sparc64/kernel/pci_iommu.c b/arch/sparc64/kernel/pci_iommu.c index 2803bc7..425c60c 100644 --- a/arch/sparc64/kernel/pci_iommu.c +++ b/arch/sparc64/kernel/pci_iommu.c @@ -466,7 +466,7 @@ do_flush_sync: if (!limit) break; udelay(1); - membar("#LoadLoad"); + rmb(); } if (!limit) printk(KERN_WARNING "pci_strbuf_flush: flushflag timeout " diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index 07424b0..6625543 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -103,7 +103,7 @@ void cpu_idle(void) * other cpus see our increasing idleness for the buddy * redistribution algorithm. -DaveM */ - membar("#StoreStore | #StoreLoad"); + membar_storeload_storestore(); } } diff --git a/arch/sparc64/kernel/sbus.c b/arch/sparc64/kernel/sbus.c index 89f5e01..e09ddf9 100644 --- a/arch/sparc64/kernel/sbus.c +++ b/arch/sparc64/kernel/sbus.c @@ -147,7 +147,7 @@ static void sbus_strbuf_flush(struct sbus_iommu *iommu, u32 base, unsigned long if (!limit) break; udelay(1); - membar("#LoadLoad"); + rmb(); } if (!limit) printk(KERN_WARNING "sbus_strbuf_flush: flushflag timeout " diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c index b1ed230..aecccd0 100644 --- a/arch/sparc64/kernel/signal32.c +++ b/arch/sparc64/kernel/signal32.c @@ -877,11 +877,12 @@ static void new_setup_frame32(struct k_sigaction *ka, struct pt_regs *regs, unsigned long page = (unsigned long) page_address(pte_page(*ptep)); - __asm__ __volatile__( - " membar #StoreStore\n" - " flush %0 + %1" - : : "r" (page), "r" (address & (PAGE_SIZE - 1)) - : "memory"); + wmb(); + __asm__ __volatile__("flush %0 + %1" + : /* no outputs */ + : "r" (page), + "r" (address & (PAGE_SIZE - 1)) + : "memory"); } pte_unmap(ptep); preempt_enable(); @@ -1292,11 +1293,12 @@ static void setup_rt_frame32(struct k_sigaction *ka, struct pt_regs *regs, unsigned long page = (unsigned long) page_address(pte_page(*ptep)); - __asm__ __volatile__( - " membar #StoreStore\n" - " flush %0 + %1" - : : "r" (page), "r" (address & (PAGE_SIZE - 1)) - : "memory"); + wmb(); + __asm__ __volatile__("flush %0 + %1" + : /* no outputs */ + : "r" (page), + "r" (address & (PAGE_SIZE - 1)) + : "memory"); } pte_unmap(ptep); preempt_enable(); diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index b9b4249..b4fc6a5 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -144,7 +144,7 @@ void __init smp_callin(void) current->active_mm = &init_mm; while (!cpu_isset(cpuid, smp_commenced_mask)) - membar("#LoadLoad"); + rmb(); cpu_set(cpuid, cpu_online_map); } @@ -184,11 +184,11 @@ static inline long get_delta (long *rt, long *master) for (i = 0; i < NUM_ITERS; i++) { t0 = tick_ops->get_tick(); go[MASTER] = 1; - membar("#StoreLoad"); + membar_storeload(); while (!(tm = go[SLAVE])) - membar("#LoadLoad"); + rmb(); go[SLAVE] = 0; - membar("#StoreStore"); + wmb(); t1 = tick_ops->get_tick(); if (t1 - t0 < best_t1 - best_t0) @@ -221,7 +221,7 @@ void smp_synchronize_tick_client(void) go[MASTER] = 1; while (go[MASTER]) - membar("#LoadLoad"); + rmb(); local_irq_save(flags); { @@ -273,21 +273,21 @@ static void smp_synchronize_one_tick(int cpu) /* wait for client to be ready */ while (!go[MASTER]) - membar("#LoadLoad"); + rmb(); /* now let the client proceed into his loop */ go[MASTER] = 0; - membar("#StoreLoad"); + membar_storeload(); spin_lock_irqsave(&itc_sync_lock, flags); { for (i = 0; i < NUM_ROUNDS*NUM_ITERS; i++) { while (!go[MASTER]) - membar("#LoadLoad"); + rmb(); go[MASTER] = 0; - membar("#StoreStore"); + wmb(); go[SLAVE] = tick_ops->get_tick(); - membar("#StoreLoad"); + membar_storeload(); } } spin_unlock_irqrestore(&itc_sync_lock, flags); @@ -927,11 +927,11 @@ void smp_capture(void) smp_processor_id()); #endif penguins_are_doing_time = 1; - membar("#StoreStore | #LoadStore"); + membar_storestore_loadstore(); atomic_inc(&smp_capture_registry); smp_cross_call(&xcall_capture, 0, 0, 0); while (atomic_read(&smp_capture_registry) != ncpus) - membar("#LoadLoad"); + rmb(); #ifdef CAPTURE_DEBUG printk("done\n"); #endif @@ -947,7 +947,7 @@ void smp_release(void) smp_processor_id()); #endif penguins_are_doing_time = 0; - membar("#StoreStore | #StoreLoad"); + membar_storeload_storestore(); atomic_dec(&smp_capture_registry); } } @@ -970,9 +970,9 @@ void smp_penguin_jailcell(int irq, struct pt_regs *regs) save_alternate_globals(global_save); prom_world(1); atomic_inc(&smp_capture_registry); - membar("#StoreLoad | #StoreStore"); + membar_storeload_storestore(); while (penguins_are_doing_time) - membar("#LoadLoad"); + rmb(); restore_alternate_globals(global_save); atomic_dec(&smp_capture_registry); prom_world(0); diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 0764b93..a3ea697 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -406,3 +406,12 @@ EXPORT_SYMBOL(xor_vis_4); EXPORT_SYMBOL(xor_vis_5); EXPORT_SYMBOL(prom_palette); + +/* memory barriers */ +EXPORT_SYMBOL(mb); +EXPORT_SYMBOL(rmb); +EXPORT_SYMBOL(wmb); +EXPORT_SYMBOL(membar_storeload); +EXPORT_SYMBOL(membar_storeload_storestore); +EXPORT_SYMBOL(membar_storeload_loadload); +EXPORT_SYMBOL(membar_storestore_loadstore); diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile index 40dbeec..6201f10 100644 --- a/arch/sparc64/lib/Makefile +++ b/arch/sparc64/lib/Makefile @@ -12,7 +12,7 @@ lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \ U1memcpy.o U1copy_from_user.o U1copy_to_user.o \ U3memcpy.o U3copy_from_user.o U3copy_to_user.o U3patch.o \ copy_in_user.o user_fixup.o memmove.o \ - mcount.o ipcsum.o rwsem.o xor.o find_bit.o delay.o + mcount.o ipcsum.o rwsem.o xor.o find_bit.o delay.o mb.o lib-$(CONFIG_DEBUG_SPINLOCK) += debuglocks.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c index 7f6ccc4..f5f0b55 100644 --- a/arch/sparc64/lib/debuglocks.c +++ b/arch/sparc64/lib/debuglocks.c @@ -61,7 +61,7 @@ again: : "=r" (val) : "r" (&(lock->lock)) : "memory"); - membar("#StoreLoad | #StoreStore"); + membar_storeload_storestore(); if (val) { while (lock->lock) { if (!--stuck) { @@ -69,7 +69,7 @@ again: show(str, lock, caller); stuck = INIT_STUCK; } - membar("#LoadLoad"); + rmb(); } goto again; } @@ -90,7 +90,7 @@ int _do_spin_trylock(spinlock_t *lock, unsigned long caller) : "=r" (val) : "r" (&(lock->lock)) : "memory"); - membar("#StoreLoad | #StoreStore"); + membar_storeload_storestore(); if (!val) { lock->owner_pc = ((unsigned int)caller); lock->owner_cpu = cpu; @@ -107,7 +107,7 @@ void _do_spin_unlock(spinlock_t *lock) { lock->owner_pc = 0; lock->owner_cpu = NO_PROC_ID; - membar("#StoreStore | #LoadStore"); + membar_storestore_loadstore(); lock->lock = 0; current->thread.smp_lock_count--; } @@ -129,7 +129,7 @@ wlock_again: show_read(str, rw, caller); stuck = INIT_STUCK; } - membar("#LoadLoad"); + rmb(); } /* Try once to increment the counter. */ __asm__ __volatile__( @@ -142,7 +142,7 @@ wlock_again: "2:" : "=r" (val) : "0" (&(rw->lock)) : "g1", "g7", "memory"); - membar("#StoreLoad | #StoreStore"); + membar_storeload_storestore(); if (val) goto wlock_again; rw->reader_pc[cpu] = ((unsigned int)caller); @@ -201,7 +201,7 @@ wlock_again: show_write(str, rw, caller); stuck = INIT_STUCK; } - membar("#LoadLoad"); + rmb(); } /* Try to acuire the write bit. */ @@ -256,7 +256,7 @@ wlock_again: show_write(str, rw, caller); stuck = INIT_STUCK; } - membar("#LoadLoad"); + rmb(); } goto wlock_again; } diff --git a/arch/sparc64/lib/mb.S b/arch/sparc64/lib/mb.S new file mode 100644 index 0000000..4004f74 --- /dev/null +++ b/arch/sparc64/lib/mb.S @@ -0,0 +1,73 @@ +/* mb.S: Out of line memory barriers. + * + * Copyright (C) 2005 David S. Miller (davem@davemloft.net) + */ + + /* These are here in an effort to more fully work around + * Spitfire Errata #51. Essentially, if a memory barrier + * occurs soon after a mispredicted branch, the chip can stop + * executing instructions until a trap occurs. Therefore, if + * interrupts are disabled, the chip can hang forever. + * + * It used to be believed that the memory barrier had to be + * right in the delay slot, but a case has been traced + * recently wherein the memory barrier was one instruction + * after the branch delay slot and the chip still hung. The + * offending sequence was the following in sym_wakeup_done() + * of the sym53c8xx_2 driver: + * + * call sym_ccb_from_dsa, 0 + * movge %icc, 0, %l0 + * brz,pn %o0, .LL1303 + * mov %o0, %l2 + * membar #LoadLoad + * + * The branch has to be mispredicted for the bug to occur. + * Therefore, we put the memory barrier explicitly into a + * "branch always, predicted taken" delay slot to avoid the + * problem case. + */ + + .text + +99: retl + nop + + .globl mb +mb: ba,pt %xcc, 99b + membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad + .size mb, .-mb + + .globl rmb +rmb: ba,pt %xcc, 99b + membar #LoadLoad + .size rmb, .-rmb + + .globl wmb +wmb: ba,pt %xcc, 99b + membar #StoreStore + .size wmb, .-wmb + + .globl membar_storeload +membar_storeload: + ba,pt %xcc, 99b + membar #StoreLoad + .size membar_storeload, .-membar_storeload + + .globl membar_storeload_storestore +membar_storeload_storestore: + ba,pt %xcc, 99b + membar #StoreLoad | #StoreStore + .size membar_storeload_storestore, .-membar_storeload_storestore + + .globl membar_storeload_loadload +membar_storeload_loadload: + ba,pt %xcc, 99b + membar #StoreLoad | #LoadLoad + .size membar_storeload_loadload, .-membar_storeload_loadload + + .globl membar_storestore_loadstore +membar_storestore_loadstore: + ba,pt %xcc, 99b + membar #StoreStore | #LoadStore + .size membar_storestore_loadstore, .-membar_storestore_loadstore diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c index 15b4cfe..302efbc 100644 --- a/arch/sparc64/solaris/misc.c +++ b/arch/sparc64/solaris/misc.c @@ -737,7 +737,8 @@ MODULE_LICENSE("GPL"); extern u32 tl0_solaris[8]; #define update_ttable(x) \ tl0_solaris[3] = (((long)(x) - (long)tl0_solaris - 3) >> 2) | 0x40000000; \ - __asm__ __volatile__ ("membar #StoreStore; flush %0" : : "r" (&tl0_solaris[3])) + wmb(); \ + __asm__ __volatile__ ("flush %0" : : "r" (&tl0_solaris[3])) #else #endif @@ -761,7 +762,8 @@ int init_module(void) entry64_personality_patch |= (offsetof(struct task_struct, personality) + (sizeof(unsigned long) - 1)); - __asm__ __volatile__("membar #StoreStore; flush %0" + wmb(); + __asm__ __volatile__("flush %0" : : "r" (&entry64_personality_patch)); return 0; } diff --git a/include/asm-sparc64/atomic.h b/include/asm-sparc64/atomic.h index d80f3379..e175afc 100644 --- a/include/asm-sparc64/atomic.h +++ b/include/asm-sparc64/atomic.h @@ -72,10 +72,10 @@ extern int atomic64_sub_ret(int, atomic64_t *); /* Atomic operations are already serializing */ #ifdef CONFIG_SMP -#define smp_mb__before_atomic_dec() membar("#StoreLoad | #LoadLoad") -#define smp_mb__after_atomic_dec() membar("#StoreLoad | #StoreStore") -#define smp_mb__before_atomic_inc() membar("#StoreLoad | #LoadLoad") -#define smp_mb__after_atomic_inc() membar("#StoreLoad | #StoreStore") +#define smp_mb__before_atomic_dec() membar_storeload_loadload(); +#define smp_mb__after_atomic_dec() membar_storeload_storestore(); +#define smp_mb__before_atomic_inc() membar_storeload_loadload(); +#define smp_mb__after_atomic_inc() membar_storeload_storestore(); #else #define smp_mb__before_atomic_dec() barrier() #define smp_mb__after_atomic_dec() barrier() diff --git a/include/asm-sparc64/bitops.h b/include/asm-sparc64/bitops.h index 9c5e719..6388b83 100644 --- a/include/asm-sparc64/bitops.h +++ b/include/asm-sparc64/bitops.h @@ -72,8 +72,8 @@ static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) } #ifdef CONFIG_SMP -#define smp_mb__before_clear_bit() membar("#StoreLoad | #LoadLoad") -#define smp_mb__after_clear_bit() membar("#StoreLoad | #StoreStore") +#define smp_mb__before_clear_bit() membar_storeload_loadload() +#define smp_mb__after_clear_bit() membar_storeload_storestore() #else #define smp_mb__before_clear_bit() barrier() #define smp_mb__after_clear_bit() barrier() diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h index d265bf6..a02c437 100644 --- a/include/asm-sparc64/spinlock.h +++ b/include/asm-sparc64/spinlock.h @@ -43,7 +43,7 @@ typedef struct { #define spin_is_locked(lp) ((lp)->lock != 0) #define spin_unlock_wait(lp) \ -do { membar("#LoadLoad"); \ +do { rmb(); \ } while((lp)->lock) static inline void _raw_spin_lock(spinlock_t *lock) @@ -129,7 +129,7 @@ typedef struct { #define spin_is_locked(__lock) ((__lock)->lock != 0) #define spin_unlock_wait(__lock) \ do { \ - membar("#LoadLoad"); \ + rmb(); \ } while((__lock)->lock) extern void _do_spin_lock(spinlock_t *lock, char *str, unsigned long caller); diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h index ee4bdfc..5e94c05 100644 --- a/include/asm-sparc64/system.h +++ b/include/asm-sparc64/system.h @@ -28,6 +28,14 @@ enum sparc_cpu { #define ARCH_SUN4C_SUN4 0 #define ARCH_SUN4 0 +extern void mb(void); +extern void rmb(void); +extern void wmb(void); +extern void membar_storeload(void); +extern void membar_storeload_storestore(void); +extern void membar_storeload_loadload(void); +extern void membar_storestore_loadstore(void); + #endif #define setipl(__new_ipl) \ @@ -78,16 +86,11 @@ enum sparc_cpu { #define nop() __asm__ __volatile__ ("nop") -#define membar(type) __asm__ __volatile__ ("membar " type : : : "memory") -#define mb() \ - membar("#LoadLoad | #LoadStore | #StoreStore | #StoreLoad") -#define rmb() membar("#LoadLoad") -#define wmb() membar("#StoreStore") #define read_barrier_depends() do { } while(0) #define set_mb(__var, __value) \ - do { __var = __value; membar("#StoreLoad | #StoreStore"); } while(0) + do { __var = __value; membar_storeload_storestore(); } while(0) #define set_wmb(__var, __value) \ - do { __var = __value; membar("#StoreStore"); } while(0) + do { __var = __value; wmb(); } while(0) #ifdef CONFIG_SMP #define smp_mb() mb() -- cgit v0.10.2 From bf3a46aa9b96f6eb3a49a568f72a2801c3e830c0 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:22:01 -0700 Subject: [NETFILTER]: convert nfmark and conntrack mark to 32bit As discussed at netconf'05, we convert nfmark and conntrack-mark to be 32bits even on 64bit architectures. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index 08fe5f7..4ed720f 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -171,7 +171,7 @@ struct ip_conntrack #endif /* CONFIG_IP_NF_NAT_NEEDED */ #if defined(CONFIG_IP_NF_CONNTRACK_MARK) - unsigned long mark; + u_int32_t mark; #endif /* Traversed often, so hopefully in different cacheline to top */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 948527e..2e40f4c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -259,7 +259,7 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_NETFILTER - unsigned long nfmark; + __u32 nfmark; __u32 nfcache; __u32 nfctinfo; struct nf_conntrack *nfct; diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 61798c4..dccd4ab 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -185,7 +185,7 @@ static int ct_seq_show(struct seq_file *s, void *v) return -ENOSPC; #if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (seq_printf(s, "mark=%lu ", conntrack->mark)) + if (seq_printf(s, "mark=%u ", conntrack->mark)) return -ENOSPC; #endif diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 6706d3a..2d05caf 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -367,7 +367,7 @@ target(struct sk_buff **pskb, #ifdef DEBUG_CLUSTERP DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); + DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); if (!clusterip_responsible(cipinfo->config, hash)) { DEBUGP("not responsible\n"); return NF_DROP; diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 30ddd3e..8ed7441 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -40,9 +40,9 @@ target(struct sk_buff **pskb, void *userinfo) { const struct ipt_connmark_target_info *markinfo = targinfo; - unsigned long diff; - unsigned long nfmark; - unsigned long newmark; + u_int32_t diff; + u_int32_t nfmark; + u_int32_t newmark; enum ip_conntrack_info ctinfo; struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); @@ -94,6 +94,11 @@ checkentry(const char *tablename, } } + if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) { + printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); + return 0; + } + return 1; } diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c index 33c6f9b..8526398 100644 --- a/net/ipv4/netfilter/ipt_MARK.c +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -76,6 +76,8 @@ checkentry_v0(const char *tablename, unsigned int targinfosize, unsigned int hook_mask) { + struct ipt_mark_target_info *markinfo = targinfo; + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", targinfosize, @@ -88,6 +90,11 @@ checkentry_v0(const char *tablename, return 0; } + if (markinfo->mark > 0xffffffff) { + printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return 0; + } + return 1; } @@ -120,6 +127,11 @@ checkentry_v1(const char *tablename, return 0; } + if (markinfo->mark > 0xffffffff) { + printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return 0; + } + return 1; } diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c index 2706f96..bf8de47 100644 --- a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/ipv4/netfilter/ipt_connmark.c @@ -54,9 +54,16 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + struct ipt_connmark_info *cm = + (struct ipt_connmark_info *)matchinfo; if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) return 0; + if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { + printk(KERN_WARNING "connmark: only support 32bit mark\n"); + return 0; + } + return 1; } diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c index 8955728..00bef6c 100644 --- a/net/ipv4/netfilter/ipt_mark.c +++ b/net/ipv4/netfilter/ipt_mark.c @@ -37,9 +37,16 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo; + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) return 0; + if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { + printk(KERN_WARNING "mark: only supports 32bit mark\n"); + return 0; + } + return 1; } -- cgit v0.10.2 From 6869c4d8e066e21623c812c448a05f1ed931c9c6 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:24:19 -0700 Subject: [NETFILTER]: reduce netfilter sk_buff enlargement As discussed at netconf'05, we're trying to save every bit in sk_buff. The patch below makes sk_buff 8 bytes smaller. I did some basic testing on my notebook and it seems to work. The only real in-tree user of nfcache was IPVS, who only needs a single bit. Unfortunately I couldn't find some other free bit in sk_buff to stuff that bit into, so I introduced a separate field for them. Maybe the IPVS guys can resolve that to further save space. Initially I wanted to shrink pkt_type to three bits (PACKET_HOST and alike are only 6 values defined), but unfortunately the bluetooth code overloads pkt_type :( The conntrack-event-api (out-of-tree) uses nfcache, but Rusty just came up with a way how to do it without any skb fields, so it's safe to remove it. - remove all never-implemented 'nfcache' code - don't have ipvs code abuse 'nfcache' field. currently get's their own compile-conditional skb->ipvs_property field. IPVS maintainers can decide to move this bit elswhere, but nfcache needs to die. - remove skb->nfcache field to save 4 bytes - move skb->nfctinfo into three unused bits to save further 4 bytes Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 2e20454..ec60856 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -21,10 +21,13 @@ #define NF_STOP 5 #define NF_MAX_VERDICT NF_STOP +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* Generic cache responses from hook functions. <= 0x2000 is used for protocol-flags. */ #define NFC_UNKNOWN 0x4000 #define NFC_ALTERED 0x8000 +#endif #ifdef __KERNEL__ #include diff --git a/include/linux/netfilter_decnet.h b/include/linux/netfilter_decnet.h index 3064eec..0189794 100644 --- a/include/linux/netfilter_decnet.h +++ b/include/linux/netfilter_decnet.h @@ -9,6 +9,8 @@ #include +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* IP Cache bits. */ /* Src IP address. */ #define NFC_DN_SRC 0x0001 @@ -18,6 +20,7 @@ #define NFC_DN_IF_IN 0x0004 /* Output device. */ #define NFC_DN_IF_OUT 0x0008 +#endif /* ! __KERNEL__ */ /* DECnet Hooks */ /* After promisc drops, checksum checks. */ diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index 3ebc36a..552815b 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -8,6 +8,8 @@ #include #include +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* IP Cache bits. */ /* Src IP address. */ #define NFC_IP_SRC 0x0001 @@ -35,6 +37,7 @@ #define NFC_IP_DST_PT 0x0400 /* Something else about the proto */ #define NFC_IP_PROTO_UNKNOWN 0x2000 +#endif /* ! __KERNEL__ */ /* IP Hooks */ /* After promisc drops, checksum checks. */ diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index bee7a5e..20c069a 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -10,6 +10,8 @@ #include +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* IP Cache bits. */ /* Src IP address. */ #define NFC_IP6_SRC 0x0001 @@ -38,6 +40,7 @@ #define NFC_IP6_DST_PT 0x0400 /* Something else about the proto */ #define NFC_IP6_PROTO_UNKNOWN 0x2000 +#endif /* ! __KERNEL__ */ /* IP6 Hooks */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2e40f4c..4b929c3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -190,7 +190,6 @@ struct skb_shared_info { * @end: End pointer * @destructor: Destruct function * @nfmark: Can be used for communication between hooks - * @nfcache: Cache info * @nfct: Associated connection, if any * @nfctinfo: Relationship of this skb to the connection * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c @@ -252,17 +251,18 @@ struct sk_buff { __u8 local_df:1, cloned:1, ip_summed:2, - nohdr:1; - /* 3 bits spare */ + nohdr:1, + nfctinfo:3; __u8 pkt_type; __be16 protocol; void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_NETFILTER __u32 nfmark; - __u32 nfcache; - __u32 nfctinfo; struct nf_conntrack *nfct; +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) + __u8 ipvs_property:1; +#endif #ifdef CONFIG_BRIDGE_NETFILTER struct nf_bridge_info *nf_bridge; #endif diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c index 02c632b..c93d35a 100644 --- a/net/bridge/netfilter/ebt_mark.c +++ b/net/bridge/netfilter/ebt_mark.c @@ -23,10 +23,9 @@ static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr, { struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; - if ((*pskb)->nfmark != info->mark) { + if ((*pskb)->nfmark != info->mark) (*pskb)->nfmark = info->mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return info->target; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7eab867..096991c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -361,7 +361,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) n->destructor = NULL; #ifdef CONFIG_NETFILTER C(nfmark); - C(nfcache); C(nfct); nf_conntrack_get(skb->nfct); C(nfctinfo); @@ -424,7 +423,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->destructor = NULL; #ifdef CONFIG_NETFILTER new->nfmark = old->nfmark; - new->nfcache = old->nfcache; new->nfct = old->nfct; nf_conntrack_get(old->nfct); new->nfctinfo = old->nfctinfo; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 80d1310..766564c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -392,7 +392,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) #endif #ifdef CONFIG_NETFILTER to->nfmark = from->nfmark; - to->nfcache = from->nfcache; /* Connection association is same as pre-frag packet */ nf_conntrack_put(to->nfct); to->nfct = from->nfct; diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 5fb257d..3ac7eec 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -22,6 +22,7 @@ * * Changes: * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache * */ @@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) + if (!((*pskb)->ipvs_property)) return NF_ACCEPT; /* The packet was sent from IPVS, exit this chain */ @@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); - skb->nfcache |= NFC_IPVS_PROPERTY; + skb->ipvs_property = 1; verdict = NF_ACCEPT; out: @@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, EnterFunction(11); - if (skb->nfcache & NFC_IPVS_PROPERTY) + if (skb->ipvs_property) return NF_ACCEPT; iph = skb->nh.iph; @@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); ip_vs_conn_put(cp); - skb->nfcache |= NFC_IPVS_PROPERTY; + skb->ipvs_property = 1; LeaveFunction(11); return NF_ACCEPT; diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index a8512a3..3b87482 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) #define IP_VS_XMIT(skb, rt) \ do { \ - (skb)->nfcache |= NFC_IPVS_PROPERTY; \ + (skb)->ipvs_property = 1; \ (skb)->ip_summed = CHECKSUM_NONE; \ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ (rt)->u.dst.dev, dst_output); \ diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a7f0c82..04c3414 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -625,9 +625,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum, return NF_DROP; } - /* FIXME: Do this right please. --RR */ - (*pskb)->nfcache |= NFC_UNKNOWN; - /* Doesn't cover locally-generated broadcast, so not worth it. */ #if 0 /* Ignore broadcast: no `connection'. */ @@ -943,10 +940,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) skb = ip_defrag(skb, user); local_bh_enable(); - if (skb) { + if (skb) ip_send_check(skb->nh.iph); - skb->nfcache |= NFC_ALTERED; - } return skb; } diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 739b6dd..ed4d731 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -321,7 +321,6 @@ manip_pkt(u_int16_t proto, { struct iphdr *iph; - (*pskb)->nfcache |= NFC_ALTERED; if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) return 0; diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 91d5ea1..9ecba97 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum, IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET))); - (*pskb)->nfcache |= NFC_UNKNOWN; - /* If we had a hardware checksum before, it's now invalid */ if ((*pskb)->ip_summed == CHECKSUM_HW) if (skb_checksum_help(*pskb, (out == NULL))) diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index c6baa81..bc0af8d 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -392,7 +392,6 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; - e->skb->nfcache |= NFC_ALTERED; /* * Extra routing may needed on local out, as the QUEUE target never diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c88dfcd..ff8d85d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -312,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb, do { IP_NF_ASSERT(e); IP_NF_ASSERT(back); - (*pskb)->nfcache |= e->nfcache; if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { struct ipt_entry_target *t; diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c index 9842e6e..dab78d8 100644 --- a/net/ipv4/netfilter/ipt_CLASSIFY.c +++ b/net/ipv4/netfilter/ipt_CLASSIFY.c @@ -32,10 +32,8 @@ target(struct sk_buff **pskb, { const struct ipt_classify_target_info *clinfo = targinfo; - if((*pskb)->priority != clinfo->priority) { + if((*pskb)->priority != clinfo->priority) (*pskb)->priority = clinfo->priority; - (*pskb)->nfcache |= NFC_ALTERED; - } return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 8ed7441..1346380 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -61,10 +61,8 @@ target(struct sk_buff **pskb, case IPT_CONNMARK_RESTORE: nfmark = (*pskb)->nfmark; diff = (ct->mark ^ nfmark) & markinfo->mask; - if (diff != 0) { + if (diff != 0) (*pskb)->nfmark = nfmark ^ diff; - (*pskb)->nfcache |= NFC_ALTERED; - } break; } } diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c index 3ea4509..975476f 100644 --- a/net/ipv4/netfilter/ipt_DSCP.c +++ b/net/ipv4/netfilter/ipt_DSCP.c @@ -51,7 +51,6 @@ target(struct sk_buff **pskb, sizeof(diffs), (*pskb)->nh.iph->check ^ 0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 94a0ce1..f63a9bc 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) sizeof(diffs), (*pskb)->nh.iph->check ^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return 1; } @@ -87,7 +86,6 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) tcph->check = csum_fold(csum_partial((char *)diffs, sizeof(diffs), tcph->check^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; return 1; } diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c index 8526398..52b4f2c 100644 --- a/net/ipv4/netfilter/ipt_MARK.c +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb, { const struct ipt_mark_target_info *markinfo = targinfo; - if((*pskb)->nfmark != markinfo->mark) { + if((*pskb)->nfmark != markinfo->mark) (*pskb)->nfmark = markinfo->mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IPT_CONTINUE; } @@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb, break; } - if((*pskb)->nfmark != mark) { + if((*pskb)->nfmark != mark) (*pskb)->nfmark = mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 9156964..f115a84 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -156,7 +156,6 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* This packet will not be the same as the other: clear nf fields */ nf_reset(nskb); - nskb->nfcache = 0; nskb->nfmark = 0; #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(nskb->nf_bridge); diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c index 7b84a25..9492883 100644 --- a/net/ipv4/netfilter/ipt_TCPMSS.c +++ b/net/ipv4/netfilter/ipt_TCPMSS.c @@ -190,7 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb, newmss); retmodified: - (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index 85c70d2..49abb7e 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -46,7 +46,6 @@ target(struct sk_buff **pskb, sizeof(diffs), (*pskb)->nh.iph->check ^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return IPT_CONTINUE; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ae652ca..590d2b7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -185,19 +185,6 @@ int ip6_route_me_harder(struct sk_buff *skb) } #endif -static inline int ip6_maybe_reroute(struct sk_buff *skb) -{ -#ifdef CONFIG_NETFILTER - if (skb->nfcache & NFC_ALTERED){ - if (ip6_route_me_harder(skb) != 0){ - kfree_skb(skb); - return -EINVAL; - } - } -#endif /* CONFIG_NETFILTER */ - return dst_output(skb); -} - /* * xmit an sk_buff (used by TCP) */ @@ -266,7 +253,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, mtu = dst_mtu(dst); if ((skb->len <= mtu) || ipfragok) { IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute); + return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, + dst_output); } if (net_ratelimit()) diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index a16df5b..83ccedc 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -388,7 +388,6 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; - e->skb->nfcache |= NFC_ALTERED; /* * Extra routing may needed on local out, as the QUEUE target never diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7303451..41a67cf 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -401,7 +401,6 @@ ip6t_do_table(struct sk_buff **pskb, do { IP_NF_ASSERT(e); IP_NF_ASSERT(back); - (*pskb)->nfcache |= e->nfcache; if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6, &protoff, &offset)) { struct ip6t_entry_target *t; diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c index d09ceb0..81924fc 100644 --- a/net/ipv6/netfilter/ip6t_MARK.c +++ b/net/ipv6/netfilter/ip6t_MARK.c @@ -28,10 +28,9 @@ target(struct sk_buff **pskb, { const struct ip6t_mark_target_info *markinfo = targinfo; - if((*pskb)->nfmark != markinfo->mark) { + if((*pskb)->nfmark != markinfo->mark) (*pskb)->nfmark = markinfo->mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IP6T_CONTINUE; } -- cgit v0.10.2 From 8728b834b226ffcf2c94a58530090e292af2a7bf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Aug 2005 19:25:21 -0700 Subject: [NET]: Kill skb->list Remove the "list" member of struct sk_buff, as it is entirely redundant. All SKB list removal callers know which list the SKB is on, so storing this in sk_buff does nothing other than taking up some space. Two tricky bits were SCTP, which I took care of, and two ATM drivers which Francois Romieu fixed up. Signed-off-by: David S. Miller Signed-off-by: Francois Romieu diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index b2a7b75..a0e3bd8 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -214,8 +214,7 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev); static void __devinit ns_init_card_error(ns_dev *card, int error); static scq_info *get_scq(int size, u32 scd); static void free_scq(scq_info *scq, struct atm_vcc *vcc); -static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, - u32 handle2, u32 addr2); +static void push_rxbufs(ns_dev *, struct sk_buff *); static irqreturn_t ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs); static int ns_open(struct atm_vcc *vcc); static void ns_close(struct atm_vcc *vcc); @@ -766,6 +765,7 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev) ns_init_card_error(card, error); return error; } + NS_SKB_CB(hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, hb); card->hbpool.count++; } @@ -786,9 +786,10 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev) ns_init_card_error(card, error); return error; } + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); /* Due to the implementation of push_rxbufs() this is 1, not 0 */ if (j == 1) { @@ -822,9 +823,10 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev) ns_init_card_error(card, error); return error; } + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } /* Test for strange behaviour which leads to crashes */ if ((bcount = ns_stat_sfbqc_get(readl(card->membase + STAT))) < card->sbnr.min) @@ -852,6 +854,7 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev) ns_init_card_error(card, error); return error; } + NS_SKB_CB(iovb)->buf_type = BUF_NONE; skb_queue_tail(&card->iovpool.queue, iovb); card->iovpool.count++; } @@ -1078,12 +1081,18 @@ static void free_scq(scq_info *scq, struct atm_vcc *vcc) /* The handles passed must be pointers to the sk_buff containing the small or large buffer(s) cast to u32. */ -static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, - u32 handle2, u32 addr2) +static void push_rxbufs(ns_dev *card, struct sk_buff *skb) { + struct ns_skb_cb *cb = NS_SKB_CB(skb); + u32 handle1, addr1; + u32 handle2, addr2; u32 stat; unsigned long flags; + /* *BARF* */ + handle2 = addr2 = 0; + handle1 = (u32)skb; + addr1 = (u32)virt_to_bus(skb->data); #ifdef GENERAL_DEBUG if (!addr1) @@ -1093,7 +1102,7 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, stat = readl(card->membase + STAT); card->sbfqc = ns_stat_sfbqc_get(stat); card->lbfqc = ns_stat_lfbqc_get(stat); - if (type == BUF_SM) + if (cb->buf_type == BUF_SM) { if (!addr2) { @@ -1111,7 +1120,7 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, } } } - else /* type == BUF_LG */ + else /* buf_type == BUF_LG */ { if (!addr2) { @@ -1132,26 +1141,26 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, if (addr2) { - if (type == BUF_SM) + if (cb->buf_type == BUF_SM) { if (card->sbfqc >= card->sbnr.max) { - skb_unlink((struct sk_buff *) handle1); + skb_unlink((struct sk_buff *) handle1, &card->sbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle1); - skb_unlink((struct sk_buff *) handle2); + skb_unlink((struct sk_buff *) handle2, &card->sbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle2); return; } else card->sbfqc += 2; } - else /* (type == BUF_LG) */ + else /* (buf_type == BUF_LG) */ { if (card->lbfqc >= card->lbnr.max) { - skb_unlink((struct sk_buff *) handle1); + skb_unlink((struct sk_buff *) handle1, &card->lbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle1); - skb_unlink((struct sk_buff *) handle2); + skb_unlink((struct sk_buff *) handle2, &card->lbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle2); return; } @@ -1166,12 +1175,12 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, writel(handle2, card->membase + DR2); writel(addr1, card->membase + DR1); writel(handle1, card->membase + DR0); - writel(NS_CMD_WRITE_FREEBUFQ | (u32) type, card->membase + CMD); + writel(NS_CMD_WRITE_FREEBUFQ | cb->buf_type, card->membase + CMD); spin_unlock_irqrestore(&card->res_lock, flags); XPRINTK("nicstar%d: Pushing %s buffers at 0x%x and 0x%x.\n", card->index, - (type == BUF_SM ? "small" : "large"), addr1, addr2); + (cb->buf_type == BUF_SM ? "small" : "large"), addr1, addr2); } if (!card->efbie && card->sbfqc >= card->sbnr.min && @@ -1322,9 +1331,10 @@ static irqreturn_t ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs) card->efbie = 0; break; } + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } card->sbfqc = i; process_rsq(card); @@ -1348,9 +1358,10 @@ static irqreturn_t ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs) card->efbie = 0; break; } + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } card->lbfqc = i; process_rsq(card); @@ -2227,6 +2238,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) recycle_rx_buf(card, skb); return; } + NS_SKB_CB(iovb)->buf_type = BUF_NONE; } else if (--card->iovpool.count < card->iovnr.min) @@ -2234,6 +2246,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) struct sk_buff *new_iovb; if ((new_iovb = alloc_skb(NS_IOVBUFSIZE, GFP_ATOMIC)) != NULL) { + NS_SKB_CB(iovb)->buf_type = BUF_NONE; skb_queue_tail(&card->iovpool.queue, new_iovb); card->iovpool.count++; } @@ -2264,7 +2277,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) if (NS_SKB(iovb)->iovcnt == 1) { - if (skb->list != &card->sbpool.queue) + if (NS_SKB_CB(skb)->buf_type != BUF_SM) { printk("nicstar%d: Expected a small buffer, and this is not one.\n", card->index); @@ -2278,7 +2291,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) } else /* NS_SKB(iovb)->iovcnt >= 2 */ { - if (skb->list != &card->lbpool.queue) + if (NS_SKB_CB(skb)->buf_type != BUF_LG) { printk("nicstar%d: Expected a large buffer, and this is not one.\n", card->index); @@ -2322,8 +2335,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) /* skb points to a small buffer */ if (!atm_charge(vcc, skb->truesize)) { - push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data), - 0, 0); + push_rxbufs(card, skb); atomic_inc(&vcc->stats->rx_drop); } else @@ -2350,8 +2362,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) { if (!atm_charge(vcc, sb->truesize)) { - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), - 0, 0); + push_rxbufs(card, sb); atomic_inc(&vcc->stats->rx_drop); } else @@ -2367,16 +2378,14 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) atomic_inc(&vcc->stats->rx); } - push_rxbufs(card, BUF_LG, (u32) skb, - (u32) virt_to_bus(skb->data), 0, 0); + push_rxbufs(card, skb); } else /* len > NS_SMBUFSIZE, the usual case */ { if (!atm_charge(vcc, skb->truesize)) { - push_rxbufs(card, BUF_LG, (u32) skb, - (u32) virt_to_bus(skb->data), 0, 0); + push_rxbufs(card, skb); atomic_inc(&vcc->stats->rx_drop); } else @@ -2394,8 +2403,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) atomic_inc(&vcc->stats->rx); } - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), - 0, 0); + push_rxbufs(card, sb); } @@ -2430,6 +2438,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) card->hbpool.count++; } } + NS_SKB_CB(hb)->buf_type = BUF_NONE; } else if (--card->hbpool.count < card->hbnr.min) @@ -2437,6 +2446,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) struct sk_buff *new_hb; if ((new_hb = dev_alloc_skb(NS_HBUFSIZE)) != NULL) { + NS_SKB_CB(new_hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, new_hb); card->hbpool.count++; } @@ -2444,6 +2454,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) { if ((new_hb = dev_alloc_skb(NS_HBUFSIZE)) != NULL) { + NS_SKB_CB(new_hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, new_hb); card->hbpool.count++; } @@ -2473,8 +2484,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) remaining = len - iov->iov_len; iov++; /* Free the small buffer */ - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), - 0, 0); + push_rxbufs(card, sb); /* Copy all large buffers to the huge buffer and free them */ for (j = 1; j < NS_SKB(iovb)->iovcnt; j++) @@ -2485,8 +2495,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) skb_put(hb, tocopy); iov++; remaining -= tocopy; - push_rxbufs(card, BUF_LG, (u32) lb, - (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } #ifdef EXTRA_DEBUG if (remaining != 0 || hb->len != len) @@ -2527,9 +2536,10 @@ static void ns_sb_destructor(struct sk_buff *sb) sb = __dev_alloc_skb(NS_SMSKBSIZE, GFP_KERNEL); if (sb == NULL) break; + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } while (card->sbfqc < card->sbnr.min); } @@ -2550,9 +2560,10 @@ static void ns_lb_destructor(struct sk_buff *lb) lb = __dev_alloc_skb(NS_LGSKBSIZE, GFP_KERNEL); if (lb == NULL) break; + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } while (card->lbfqc < card->lbnr.min); } @@ -2569,6 +2580,7 @@ static void ns_hb_destructor(struct sk_buff *hb) hb = __dev_alloc_skb(NS_HBUFSIZE, GFP_KERNEL); if (hb == NULL) break; + NS_SKB_CB(hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, hb); card->hbpool.count++; } @@ -2577,45 +2589,25 @@ static void ns_hb_destructor(struct sk_buff *hb) #endif /* NS_USE_DESTRUCTORS */ - static void recycle_rx_buf(ns_dev *card, struct sk_buff *skb) { - if (skb->list == &card->sbpool.queue) - push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data), 0, 0); - else if (skb->list == &card->lbpool.queue) - push_rxbufs(card, BUF_LG, (u32) skb, (u32) virt_to_bus(skb->data), 0, 0); - else - { - printk("nicstar%d: What kind of rx buffer is this?\n", card->index); - dev_kfree_skb_any(skb); - } -} + struct ns_skb_cb *cb = NS_SKB_CB(skb); + if (unlikely(cb->buf_type == BUF_NONE)) { + printk("nicstar%d: What kind of rx buffer is this?\n", card->index); + dev_kfree_skb_any(skb); + } else + push_rxbufs(card, skb); +} static void recycle_iovec_rx_bufs(ns_dev *card, struct iovec *iov, int count) { - struct sk_buff *skb; - - for (; count > 0; count--) - { - skb = (struct sk_buff *) (iov++)->iov_base; - if (skb->list == &card->sbpool.queue) - push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data), - 0, 0); - else if (skb->list == &card->lbpool.queue) - push_rxbufs(card, BUF_LG, (u32) skb, (u32) virt_to_bus(skb->data), - 0, 0); - else - { - printk("nicstar%d: What kind of rx buffer is this?\n", card->index); - dev_kfree_skb_any(skb); - } - } + while (count-- > 0) + recycle_rx_buf(card, (struct sk_buff *) (iov++)->iov_base); } - static void recycle_iov_buf(ns_dev *card, struct sk_buff *iovb) { if (card->iovpool.count < card->iovnr.max) @@ -2631,7 +2623,7 @@ static void recycle_iov_buf(ns_dev *card, struct sk_buff *iovb) static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb) { - skb_unlink(sb); + skb_unlink(sb, &card->sbpool.queue); #ifdef NS_USE_DESTRUCTORS if (card->sbfqc < card->sbnr.min) #else @@ -2640,10 +2632,10 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb) struct sk_buff *new_sb; if ((new_sb = dev_alloc_skb(NS_SMSKBSIZE)) != NULL) { + NS_SKB_CB(new_sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, new_sb); skb_reserve(new_sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) new_sb, - (u32) virt_to_bus(new_sb->data), 0, 0); + push_rxbufs(card, new_sb); } } if (card->sbfqc < card->sbnr.init) @@ -2652,10 +2644,10 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb) struct sk_buff *new_sb; if ((new_sb = dev_alloc_skb(NS_SMSKBSIZE)) != NULL) { + NS_SKB_CB(new_sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, new_sb); skb_reserve(new_sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) new_sb, - (u32) virt_to_bus(new_sb->data), 0, 0); + push_rxbufs(card, new_sb); } } } @@ -2664,7 +2656,7 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb) static void dequeue_lg_buf(ns_dev *card, struct sk_buff *lb) { - skb_unlink(lb); + skb_unlink(lb, &card->lbpool.queue); #ifdef NS_USE_DESTRUCTORS if (card->lbfqc < card->lbnr.min) #else @@ -2673,10 +2665,10 @@ static void dequeue_lg_buf(ns_dev *card, struct sk_buff *lb) struct sk_buff *new_lb; if ((new_lb = dev_alloc_skb(NS_LGSKBSIZE)) != NULL) { + NS_SKB_CB(new_lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, new_lb); skb_reserve(new_lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) new_lb, - (u32) virt_to_bus(new_lb->data), 0, 0); + push_rxbufs(card, new_lb); } } if (card->lbfqc < card->lbnr.init) @@ -2685,10 +2677,10 @@ static void dequeue_lg_buf(ns_dev *card, struct sk_buff *lb) struct sk_buff *new_lb; if ((new_lb = dev_alloc_skb(NS_LGSKBSIZE)) != NULL) { + NS_SKB_CB(new_lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, new_lb); skb_reserve(new_lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) new_lb, - (u32) virt_to_bus(new_lb->data), 0, 0); + push_rxbufs(card, new_lb); } } } @@ -2880,9 +2872,10 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg) sb = __dev_alloc_skb(NS_SMSKBSIZE, GFP_KERNEL); if (sb == NULL) return -ENOMEM; + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } break; @@ -2894,9 +2887,10 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg) lb = __dev_alloc_skb(NS_LGSKBSIZE, GFP_KERNEL); if (lb == NULL) return -ENOMEM; + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } break; @@ -2923,6 +2917,7 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg) hb = __dev_alloc_skb(NS_HBUFSIZE, GFP_KERNEL); if (hb == NULL) return -ENOMEM; + NS_SKB_CB(hb)->buf_type = BUF_NONE; ns_grab_int_lock(card, flags); skb_queue_tail(&card->hbpool.queue, hb); card->hbpool.count++; @@ -2953,6 +2948,7 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg) iovb = alloc_skb(NS_IOVBUFSIZE, GFP_KERNEL); if (iovb == NULL) return -ENOMEM; + NS_SKB_CB(iovb)->buf_type = BUF_NONE; ns_grab_int_lock(card, flags); skb_queue_tail(&card->iovpool.queue, iovb); card->iovpool.count++; @@ -2979,17 +2975,12 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg) } - static void which_list(ns_dev *card, struct sk_buff *skb) { - printk("It's a %s buffer.\n", skb->list == &card->sbpool.queue ? - "small" : skb->list == &card->lbpool.queue ? "large" : - skb->list == &card->hbpool.queue ? "huge" : - skb->list == &card->iovpool.queue ? "iovec" : "unknown"); + printk("skb buf_type: 0x%08x\n", NS_SKB_CB(skb)->buf_type); } - static void ns_poll(unsigned long arg) { int i; diff --git a/drivers/atm/nicstar.h b/drivers/atm/nicstar.h index ea83c46..5997bcb 100644 --- a/drivers/atm/nicstar.h +++ b/drivers/atm/nicstar.h @@ -103,8 +103,14 @@ #define NS_IOREMAP_SIZE 4096 -#define BUF_SM 0x00000000 /* These two are used for push_rxbufs() */ -#define BUF_LG 0x00000001 /* CMD, Write_FreeBufQ, LBUF bit */ +/* + * BUF_XX distinguish the Rx buffers depending on their (small/large) size. + * BUG_SM and BUG_LG are both used by the driver and the device. + * BUF_NONE is only used by the driver. + */ +#define BUF_SM 0x00000000 /* These two are used for push_rxbufs() */ +#define BUF_LG 0x00000001 /* CMD, Write_FreeBufQ, LBUF bit */ +#define BUF_NONE 0xffffffff /* Software only: */ #define NS_HBUFSIZE 65568 /* Size of max. AAL5 PDU */ #define NS_MAX_IOVECS (2 + (65568 - NS_SMBUFSIZE) / \ @@ -684,6 +690,12 @@ enum ns_regs /* Device driver structures ***************************************************/ +struct ns_skb_cb { + u32 buf_type; /* BUF_SM/BUF_LG/BUF_NONE */ +}; + +#define NS_SKB_CB(skb) ((struct ns_skb_cb *)((skb)->cb)) + typedef struct tsq_info { void *org; diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c index a2b236a..85fee95 100644 --- a/drivers/atm/zatm.c +++ b/drivers/atm/zatm.c @@ -417,10 +417,12 @@ printk("dummy: 0x%08lx, 0x%08lx\n",dummy[0],dummy[1]); chan = (here[3] & uPD98401_AAL5_CHAN) >> uPD98401_AAL5_CHAN_SHIFT; if (chan < zatm_dev->chans && zatm_dev->rx_map[chan]) { + int pos = ZATM_VCC(vcc)->pool; + vcc = zatm_dev->rx_map[chan]; - if (skb == zatm_dev->last_free[ZATM_VCC(vcc)->pool]) - zatm_dev->last_free[ZATM_VCC(vcc)->pool] = NULL; - skb_unlink(skb); + if (skb == zatm_dev->last_free[pos]) + zatm_dev->last_free[pos] = NULL; + skb_unlink(skb, zatm_dev->pool + pos); } else { printk(KERN_ERR DEV_LABEL "(itf %d): RX indication " diff --git a/drivers/bluetooth/bfusb.c b/drivers/bluetooth/bfusb.c index c42d7e6..e8d2a34 100644 --- a/drivers/bluetooth/bfusb.c +++ b/drivers/bluetooth/bfusb.c @@ -158,7 +158,7 @@ static int bfusb_send_bulk(struct bfusb *bfusb, struct sk_buff *skb) if (err) { BT_ERR("%s bulk tx submit failed urb %p err %d", bfusb->hdev->name, urb, err); - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); usb_free_urb(urb); } else atomic_inc(&bfusb->pending_tx); @@ -212,7 +212,7 @@ static void bfusb_tx_complete(struct urb *urb, struct pt_regs *regs) read_lock(&bfusb->lock); - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); skb_queue_tail(&bfusb->completed_q, skb); bfusb_tx_wakeup(bfusb); @@ -253,7 +253,7 @@ static int bfusb_rx_submit(struct bfusb *bfusb, struct urb *urb) if (err) { BT_ERR("%s bulk rx submit failed urb %p err %d", bfusb->hdev->name, urb, err); - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); kfree_skb(skb); usb_free_urb(urb); } @@ -398,7 +398,7 @@ static void bfusb_rx_complete(struct urb *urb, struct pt_regs *regs) buf += len; } - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); kfree_skb(skb); bfusb_rx_submit(bfusb, urb); diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c index b248d89..d633770 100644 --- a/drivers/ieee1394/ieee1394_core.c +++ b/drivers/ieee1394/ieee1394_core.c @@ -681,7 +681,7 @@ static void handle_packet_response(struct hpsb_host *host, int tcode, return; } - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &host->pending_packet_queue); if (packet->state == hpsb_queued) { packet->sendtime = jiffies; @@ -989,7 +989,7 @@ void abort_timedouts(unsigned long __opaque) packet = (struct hpsb_packet *)skb->data; if (time_before(packet->sendtime + expire, jiffies)) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &host->pending_packet_queue); packet->state = hpsb_complete; packet->ack_code = ACKX_TIMEOUT; queue_packet_complete(packet); diff --git a/drivers/isdn/act2000/capi.c b/drivers/isdn/act2000/capi.c index afa4668..6ae6eb3 100644 --- a/drivers/isdn/act2000/capi.c +++ b/drivers/isdn/act2000/capi.c @@ -606,7 +606,7 @@ handle_ack(act2000_card *card, act2000_chan *chan, __u8 blocknr) { if ((((m->msg.data_b3_req.fakencci >> 8) & 0xff) == chan->ncci) && (m->msg.data_b3_req.blocknr == blocknr)) { /* found corresponding DATA_B3_REQ */ - skb_unlink(tmp); + skb_unlink(tmp, &card->ackq); chan->queued -= m->msg.data_b3_req.datalen; if (m->msg.data_b3_req.flags) ret = m->msg.data_b3_req.datalen; diff --git a/drivers/net/shaper.c b/drivers/net/shaper.c index 3ad0b67..221354e 100644 --- a/drivers/net/shaper.c +++ b/drivers/net/shaper.c @@ -156,52 +156,6 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev) SHAPERCB(skb)->shapelen= shaper_clocks(shaper,skb); -#ifdef SHAPER_COMPLEX /* and broken.. */ - - while(ptr && ptr!=(struct sk_buff *)&shaper->sendq) - { - if(ptr->pripri - && jiffies - SHAPERCB(ptr)->shapeclock < SHAPER_MAXSLIP) - { - struct sk_buff *tmp=ptr->prev; - - /* - * It goes before us therefore we slip the length - * of the new frame. - */ - - SHAPERCB(ptr)->shapeclock+=SHAPERCB(skb)->shapelen; - SHAPERCB(ptr)->shapelatency+=SHAPERCB(skb)->shapelen; - - /* - * The packet may have slipped so far back it - * fell off. - */ - if(SHAPERCB(ptr)->shapelatency > SHAPER_LATENCY) - { - skb_unlink(ptr); - dev_kfree_skb(ptr); - } - ptr=tmp; - } - else - break; - } - if(ptr==NULL || ptr==(struct sk_buff *)&shaper->sendq) - skb_queue_head(&shaper->sendq,skb); - else - { - struct sk_buff *tmp; - /* - * Set the packet clock out time according to the - * frames ahead. Im sure a bit of thought could drop - * this loop. - */ - for(tmp=skb_peek(&shaper->sendq); tmp!=NULL && tmp!=ptr; tmp=tmp->next) - SHAPERCB(skb)->shapeclock+=tmp->shapelen; - skb_append(ptr,skb); - } -#else { struct sk_buff *tmp; /* @@ -220,7 +174,7 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev) } else skb_queue_tail(&shaper->sendq, skb); } -#endif + if(sh_debug) printk("Frame queued.\n"); if(skb_queue_len(&shaper->sendq)>SHAPER_QLEN) @@ -302,7 +256,7 @@ static void shaper_kick(struct shaper *shaper) * Pull the frame and get interrupts back on. */ - skb_unlink(skb); + skb_unlink(skb, &shaper->sendq); if (shaper->recovery < SHAPERCB(skb)->shapeclock + SHAPERCB(skb)->shapelen) shaper->recovery = SHAPERCB(skb)->shapeclock + SHAPERCB(skb)->shapelen; diff --git a/drivers/net/wan/sdla_fr.c b/drivers/net/wan/sdla_fr.c index c5f5e62..0497dbd 100644 --- a/drivers/net/wan/sdla_fr.c +++ b/drivers/net/wan/sdla_fr.c @@ -445,7 +445,7 @@ void s508_s514_unlock(sdla_t *card, unsigned long *smp_flags); void s508_s514_lock(sdla_t *card, unsigned long *smp_flags); unsigned short calc_checksum (char *, int); -static int setup_fr_header(struct sk_buff** skb, +static int setup_fr_header(struct sk_buff *skb, struct net_device* dev, char op_mode); @@ -1372,7 +1372,7 @@ static int if_send(struct sk_buff* skb, struct net_device* dev) /* Move the if_header() code to here. By inserting frame * relay header in if_header() we would break the * tcpdump and other packet sniffers */ - chan->fr_header_len = setup_fr_header(&skb,dev,chan->common.usedby); + chan->fr_header_len = setup_fr_header(skb,dev,chan->common.usedby); if (chan->fr_header_len < 0 ){ ++chan->ifstats.tx_dropped; ++card->wandev.stats.tx_dropped; @@ -1597,8 +1597,6 @@ static int setup_for_delayed_transmit(struct net_device* dev, return 1; } - skb_unlink(skb); - chan->transmit_length = len; chan->delay_skb = skb; @@ -4871,18 +4869,15 @@ static void unconfig_fr (sdla_t *card) } } -static int setup_fr_header(struct sk_buff **skb_orig, struct net_device* dev, +static int setup_fr_header(struct sk_buff *skb, struct net_device* dev, char op_mode) { - struct sk_buff *skb = *skb_orig; fr_channel_t *chan=dev->priv; - if (op_mode == WANPIPE){ - + if (op_mode == WANPIPE) { chan->fr_header[0]=Q922_UI; switch (htons(skb->protocol)){ - case ETH_P_IP: chan->fr_header[1]=NLPID_IP; break; @@ -4894,16 +4889,14 @@ static int setup_fr_header(struct sk_buff **skb_orig, struct net_device* dev, } /* If we are in bridging mode, we must apply - * an Ethernet header */ - if (op_mode == BRIDGE || op_mode == BRIDGE_NODE){ - - + * an Ethernet header + */ + if (op_mode == BRIDGE || op_mode == BRIDGE_NODE) { /* Encapsulate the packet as a bridged Ethernet frame. */ #ifdef DEBUG printk(KERN_INFO "%s: encapsulating skb for frame relay\n", dev->name); #endif - chan->fr_header[0] = 0x03; chan->fr_header[1] = 0x00; chan->fr_header[2] = 0x80; @@ -4916,7 +4909,6 @@ static int setup_fr_header(struct sk_buff **skb_orig, struct net_device* dev, /* Yuck. */ skb->protocol = ETH_P_802_3; return 8; - } return 0; diff --git a/drivers/usb/net/usbnet.c b/drivers/usb/net/usbnet.c index 4528a00..a2f6724 100644 --- a/drivers/usb/net/usbnet.c +++ b/drivers/usb/net/usbnet.c @@ -2903,19 +2903,18 @@ static struct net_device_stats *usbnet_get_stats (struct net_device *net) * completion callbacks. 2.5 should have fixed those bugs... */ -static void defer_bh (struct usbnet *dev, struct sk_buff *skb) +static void defer_bh(struct usbnet *dev, struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff_head *list = skb->list; unsigned long flags; - spin_lock_irqsave (&list->lock, flags); - __skb_unlink (skb, list); - spin_unlock (&list->lock); - spin_lock (&dev->done.lock); - __skb_queue_tail (&dev->done, skb); + spin_lock_irqsave(&list->lock, flags); + __skb_unlink(skb, list); + spin_unlock(&list->lock); + spin_lock(&dev->done.lock); + __skb_queue_tail(&dev->done, skb); if (dev->done.qlen == 1) - tasklet_schedule (&dev->bh); - spin_unlock_irqrestore (&dev->done.lock, flags); + tasklet_schedule(&dev->bh); + spin_unlock_irqrestore(&dev->done.lock, flags); } /* some work can't be done in tasklets, so we use keventd @@ -3120,7 +3119,7 @@ block: break; } - defer_bh (dev, skb); + defer_bh(dev, skb, &dev->rxq); if (urb) { if (netif_running (dev->net) @@ -3490,7 +3489,7 @@ static void tx_complete (struct urb *urb, struct pt_regs *regs) urb->dev = NULL; entry->state = tx_done; - defer_bh (dev, skb); + defer_bh(dev, skb, &dev->txq); } /*-------------------------------------------------------------------------*/ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4b929c3..76c6885 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -204,7 +204,6 @@ struct sk_buff { struct sk_buff *next; struct sk_buff *prev; - struct sk_buff_head *list; struct sock *sk; struct timeval stamp; struct net_device *dev; @@ -597,7 +596,6 @@ static inline void __skb_queue_head(struct sk_buff_head *list, { struct sk_buff *prev, *next; - newsk->list = list; list->qlen++; prev = (struct sk_buff *)list; next = prev->next; @@ -622,7 +620,6 @@ static inline void __skb_queue_tail(struct sk_buff_head *list, { struct sk_buff *prev, *next; - newsk->list = list; list->qlen++; next = (struct sk_buff *)list; prev = next->prev; @@ -655,7 +652,6 @@ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) next->prev = prev; prev->next = next; result->next = result->prev = NULL; - result->list = NULL; } return result; } @@ -664,7 +660,7 @@ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) /* * Insert a packet on a list. */ -extern void skb_insert(struct sk_buff *old, struct sk_buff *newsk); +extern void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) @@ -672,24 +668,23 @@ static inline void __skb_insert(struct sk_buff *newsk, newsk->next = next; newsk->prev = prev; next->prev = prev->next = newsk; - newsk->list = list; list->qlen++; } /* * Place a packet after a given packet in a list. */ -extern void skb_append(struct sk_buff *old, struct sk_buff *newsk); -static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk) +extern void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); +static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { - __skb_insert(newsk, old, old->next, old->list); + __skb_insert(newsk, old, old->next, list); } /* * remove sk_buff from list. _Must_ be called atomically, and with * the list known.. */ -extern void skb_unlink(struct sk_buff *skb); +extern void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list); static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; @@ -698,7 +693,6 @@ static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; - skb->list = NULL; next->prev = prev; prev->next = next; } diff --git a/net/atm/ipcommon.c b/net/atm/ipcommon.c index 181a300..4b1faca 100644 --- a/net/atm/ipcommon.c +++ b/net/atm/ipcommon.c @@ -34,7 +34,6 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to) { - struct sk_buff *skb; unsigned long flags; struct sk_buff *skb_from = (struct sk_buff *) from; struct sk_buff *skb_to = (struct sk_buff *) to; @@ -47,8 +46,6 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to) prev->next = skb_to; to->prev->next = from->next; to->prev = from->prev; - for (skb = from->next; skb != skb_to; skb = skb->next) - skb->list = to; to->qlen += from->qlen; spin_unlock(&to->lock); from->prev = skb_from; diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 99694b5..eb7343c 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -76,7 +76,7 @@ void ax25_requeue_frames(ax25_cb *ax25) if (skb_prev == NULL) skb_queue_head(&ax25->write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &ax25->write_queue); skb_prev = skb; } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 096991c..e6564b0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -281,8 +281,6 @@ void kfree_skbmem(struct sk_buff *skb) void __kfree_skb(struct sk_buff *skb) { - BUG_ON(skb->list != NULL); - dst_release(skb->dst); #ifdef CONFIG_XFRM secpath_put(skb->sp); @@ -333,7 +331,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) #define C(x) n->x = skb->x n->next = n->prev = NULL; - n->list = NULL; n->sk = NULL; C(stamp); C(dev); @@ -403,7 +400,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) */ unsigned long offset = new->data - old->data; - new->list = NULL; new->sk = NULL; new->dev = old->dev; new->real_dev = old->real_dev; @@ -1342,50 +1338,43 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) __skb_queue_tail(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } + /** * skb_unlink - remove a buffer from a list * @skb: buffer to remove + * @list: list to use * - * Place a packet after a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls + * Remove a packet from a list. The list locks are taken and this + * function is atomic with respect to other list locked calls * - * Works even without knowing the list it is sitting on, which can be - * handy at times. It also means that THE LIST MUST EXIST when you - * unlink. Thus a list must have its contents unlinked before it is - * destroyed. + * You must know what list the SKB is on. */ -void skb_unlink(struct sk_buff *skb) +void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff_head *list = skb->list; - - if (list) { - unsigned long flags; + unsigned long flags; - spin_lock_irqsave(&list->lock, flags); - if (skb->list == list) - __skb_unlink(skb, skb->list); - spin_unlock_irqrestore(&list->lock, flags); - } + spin_lock_irqsave(&list->lock, flags); + __skb_unlink(skb, list); + spin_unlock_irqrestore(&list->lock, flags); } - /** * skb_append - append a buffer * @old: buffer to insert after * @newsk: buffer to insert + * @list: list to use * * Place a packet after a given packet in a list. The list locks are taken * and this function is atomic with respect to other list locked calls. * A buffer cannot be placed on two lists at the same time. */ - -void skb_append(struct sk_buff *old, struct sk_buff *newsk) +void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { unsigned long flags; - spin_lock_irqsave(&old->list->lock, flags); - __skb_append(old, newsk); - spin_unlock_irqrestore(&old->list->lock, flags); + spin_lock_irqsave(&list->lock, flags); + __skb_append(old, newsk, list); + spin_unlock_irqrestore(&list->lock, flags); } @@ -1393,19 +1382,21 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk) * skb_insert - insert a buffer * @old: buffer to insert before * @newsk: buffer to insert + * @list: list to use + * + * Place a packet before a given packet in a list. The list locks are + * taken and this function is atomic with respect to other list locked + * calls. * - * Place a packet before a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls * A buffer cannot be placed on two lists at the same time. */ - -void skb_insert(struct sk_buff *old, struct sk_buff *newsk) +void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { unsigned long flags; - spin_lock_irqsave(&old->list->lock, flags); - __skb_insert(newsk, old->prev, old, old->list); - spin_unlock_irqrestore(&old->list->lock, flags); + spin_lock_irqsave(&list->lock, flags); + __skb_insert(newsk, old->prev, old, list); + spin_unlock_irqrestore(&list->lock, flags); } #if 0 diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index acdd18e..0c30409 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1763,7 +1763,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, nskb = skb->next; if (skb->len == 0) { - skb_unlink(skb); + skb_unlink(skb, queue); kfree_skb(skb); /* * N.B. Don't refer to skb or cb after this point diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 8cce1fd..e0bebf4 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -479,7 +479,7 @@ int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff xmit_count = cb2->xmit_count; segnum = cb2->segnum; /* Remove and drop ack'ed packet */ - skb_unlink(ack); + skb_unlink(ack, q); kfree_skb(ack); ack = NULL; diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index de691e1..b807a31 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -869,7 +869,7 @@ static void aun_tx_ack(unsigned long seq, int result) foundit: tx_result(skb->sk, eb->cookie, result); - skb_unlink(skb); + skb_unlink(skb, &aun_queue); spin_unlock_irqrestore(&aun_queue_lock, flags); kfree_skb(skb); } @@ -947,7 +947,7 @@ static void ab_cleanup(unsigned long h) { tx_result(skb->sk, eb->cookie, ECTYPE_TRANSMIT_NOT_PRESENT); - skb_unlink(skb); + skb_unlink(skb, &aun_queue); kfree_skb(skb); } skb = newskb; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 69b1fcf..d2696af 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -975,7 +975,7 @@ do_fault: if (!skb->len) { if (sk->sk_send_head == skb) sk->sk_send_head = NULL; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53a8a53..ffa2402 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2085,7 +2085,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt seq_rtt = now - scb->when; tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); } @@ -2853,7 +2853,7 @@ static void tcp_ofo_queue(struct sock *sk) if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received \n"); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); continue; } @@ -2861,7 +2861,7 @@ static void tcp_ofo_queue(struct sock *sk) tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if(skb->h.th->fin) @@ -3027,7 +3027,7 @@ drop: u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (seq == TCP_SKB_CB(skb1)->end_seq) { - __skb_append(skb1, skb); + __skb_append(skb1, skb, &tp->out_of_order_queue); if (!tp->rx_opt.num_sacks || tp->selective_acks[0].end_seq != seq) @@ -3071,7 +3071,7 @@ drop: tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); break; } - __skb_unlink(skb1, skb1->list); + __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); __kfree_skb(skb1); } @@ -3088,8 +3088,9 @@ add_sack: * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff *head, - struct sk_buff *tail, u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, + struct sk_buff *head, struct sk_buff *tail, + u32 start, u32 end) { struct sk_buff *skb; @@ -3099,7 +3100,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { struct sk_buff *next = skb->next; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; @@ -3145,7 +3146,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_insert(nskb, skb->prev, skb, skb->list); + __skb_insert(nskb, skb->prev, skb, list); sk_stream_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ @@ -3164,7 +3165,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { struct sk_buff *next = skb->next; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; @@ -3200,7 +3201,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk) if (skb == (struct sk_buff *)&tp->out_of_order_queue || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, head, skb, start, end); + tcp_collapse(sk, &tp->out_of_order_queue, + head, skb, start, end); head = skb; if (skb == (struct sk_buff *)&tp->out_of_order_queue) break; @@ -3237,7 +3239,8 @@ static int tcp_prune_queue(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); - tcp_collapse(sk, sk->sk_receive_queue.next, + tcp_collapse(sk, &sk->sk_receive_queue, + sk->sk_receive_queue.next, (struct sk_buff*)&sk->sk_receive_queue, tp->copied_seq, tp->rcv_nxt); sk_stream_mem_reclaim(sk); @@ -3462,7 +3465,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); tp->copied_seq++; if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index dd30dd1..a4d1eb9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -505,7 +505,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff); + __skb_append(skb, buff, &sk->sk_write_queue); return 0; } @@ -893,7 +893,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff); + __skb_append(skb, buff, &sk->sk_write_queue); return 0; } @@ -1238,7 +1238,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m tcp_skb_pcount(next_skb) != 1); /* Ok. We will be able to collapse the packet. */ - __skb_unlink(next_skb, next_skb->list); + __skb_unlink(next_skb, &sk->sk_write_queue); memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index 6dafbb4..eb65b49 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c @@ -988,9 +988,6 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command) IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); return; } - /* Unlink tx_skb from list */ - tx_skb->next = tx_skb->prev = NULL; - tx_skb->list = NULL; /* Clear old Nr field + poll bit */ tx_skb->data[1] &= 0x0f; @@ -1063,9 +1060,6 @@ void irlap_resend_rejected_frame(struct irlap_cb *self, int command) IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); return; } - /* Unlink tx_skb from list */ - tx_skb->next = tx_skb->prev = NULL; - tx_skb->list = NULL; /* Clear old Nr field + poll bit */ tx_skb->data[1] &= 0x0f; diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c index 5de05a0..8b5eefd 100644 --- a/net/lapb/lapb_subr.c +++ b/net/lapb/lapb_subr.c @@ -78,7 +78,7 @@ void lapb_requeue_frames(struct lapb_cb *lapb) if (!skb_prev) skb_queue_head(&lapb->write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &lapb->write_queue); skb_prev = skb; } } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 20b4cfe..f49b82d 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -714,7 +714,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (uaddr) memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); msg->msg_namelen = sizeof(*uaddr); - if (!skb->list) { + if (!skb->next) { dgram_free: kfree_skb(skb); } diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index eba812a..5715486 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -71,7 +71,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) if (!ev->ind_prim && !ev->cfm_prim) { /* indicate or confirm not required */ - if (!skb->list) + /* XXX this is not very pretty, perhaps we should store + * XXX indicate/confirm-needed state in the llc_conn_state_ev + * XXX control block of the SKB instead? -DaveM + */ + if (!skb->next) goto out_kfree_skb; goto out_skb_put; } diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index 0627347..252c1b3 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -77,7 +77,7 @@ void nr_requeue_frames(struct sock *sk) if (skb_prev == NULL) skb_queue_head(&sk->sk_write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c index 7db7e1c..ae135e2 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c @@ -74,7 +74,7 @@ void rose_requeue_frames(struct sock *sk) if (skb_prev == NULL) skb_queue_head(&sk->sk_write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 091a66f..4454afe 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4892,7 +4892,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &oldsk->sk_receive_queue); __skb_queue_tail(&newsk->sk_receive_queue, skb); } } @@ -4921,7 +4921,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &oldsp->pd_lobby); __skb_queue_tail(queue, skb); } } diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 8bbc279..ec2c857 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -50,9 +50,9 @@ /* Forward declarations for internal helpers. */ static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq, - struct sctp_ulpevent *); + struct sctp_ulpevent *); static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *, - struct sctp_ulpevent *); + struct sctp_ulpevent *); /* 1st Level Abstractions */ @@ -125,7 +125,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, event = sctp_ulpq_order(ulpq, event); } - /* Send event to the ULP. */ + /* Send event to the ULP. 'event' is the sctp_ulpevent for + * very first SKB on the 'temp' list. + */ if (event) sctp_ulpq_tail_event(ulpq, event); @@ -158,14 +160,18 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) return sctp_clear_pd(ulpq->asoc->base.sk); } - - +/* If the SKB of 'event' is on a list, it is the first such member + * of that list. + */ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sock *sk = ulpq->asoc->base.sk; - struct sk_buff_head *queue; + struct sk_buff_head *queue, *skb_list; + struct sk_buff *skb = sctp_event2skb(event); int clear_pd = 0; + skb_list = (struct sk_buff_head *) skb->prev; + /* If the socket is just going to throw this away, do not * even try to deliver it. */ @@ -197,10 +203,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) /* If we are harvesting multiple skbs they will be * collected on a list. */ - if (sctp_event2skb(event)->list) - sctp_skb_list_tail(sctp_event2skb(event)->list, queue); + if (skb_list) + sctp_skb_list_tail(skb_list, queue); else - __skb_queue_tail(queue, sctp_event2skb(event)); + __skb_queue_tail(queue, skb); /* Did we just complete partial delivery and need to get * rolling again? Move pending data to the receive @@ -214,10 +220,11 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) return 1; out_free: - if (sctp_event2skb(event)->list) - sctp_queue_purge_ulpevents(sctp_event2skb(event)->list); + if (skb_list) + sctp_queue_purge_ulpevents(skb_list); else sctp_ulpevent_free(event); + return 0; } @@ -269,7 +276,7 @@ static inline void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, * payload was fragmented on the way and ip had to reassemble them. * We add the rest of skb's to the first skb's fraglist. */ -static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, struct sk_buff *l_frag) +static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag) { struct sk_buff *pos; struct sctp_ulpevent *event; @@ -294,7 +301,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, skb_shinfo(f_frag)->frag_list = pos; /* Remove the first fragment from the reassembly queue. */ - __skb_unlink(f_frag, f_frag->list); + __skb_unlink(f_frag, queue); while (pos) { pnext = pos->next; @@ -304,7 +311,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, f_frag->data_len += pos->len; /* Remove the fragment from the reassembly queue. */ - __skb_unlink(pos, pos->list); + __skb_unlink(pos, queue); /* Break if we have reached the last fragment. */ if (pos == l_frag) @@ -375,7 +382,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u done: return retval; found: - retval = sctp_make_reassembled_event(first_frag, pos); + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; goto done; @@ -435,7 +442,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq * further. */ done: - retval = sctp_make_reassembled_event(first_frag, last_frag); + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag); if (retval && is_last) retval->msg_flags |= MSG_EOR; @@ -527,7 +534,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *u * further. */ done: - retval = sctp_make_reassembled_event(first_frag, last_frag); + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag); return retval; } @@ -537,6 +544,7 @@ done: static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { + struct sk_buff_head *event_list; struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; struct sctp_stream *in; @@ -547,6 +555,8 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, ssn = event->ssn; in = &ulpq->asoc->ssnmap->in; + event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; + /* We are holding the chunks by stream, by SSN. */ sctp_skb_for_each(pos, &ulpq->lobby, tmp) { cevent = (struct sctp_ulpevent *) pos->cb; @@ -567,10 +577,10 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, /* Found it, so mark in the ssnmap. */ sctp_ssn_next(in, sid); - __skb_unlink(pos, pos->list); + __skb_unlink(pos, &ulpq->lobby); /* Attach all gathered skbs to the event. */ - __skb_queue_tail(sctp_event2skb(event)->list, pos); + __skb_queue_tail(event_list, pos); } } @@ -626,7 +636,7 @@ static inline void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq, } static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, - struct sctp_ulpevent *event) + struct sctp_ulpevent *event) { __u16 sid, ssn; struct sctp_stream *in; @@ -667,7 +677,7 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq) { struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; - struct sctp_ulpevent *event = NULL; + struct sctp_ulpevent *event; struct sctp_stream *in; struct sk_buff_head temp; __u16 csid, cssn; @@ -675,6 +685,8 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq) in = &ulpq->asoc->ssnmap->in; /* We are holding the chunks by stream, by SSN. */ + skb_queue_head_init(&temp); + event = NULL; sctp_skb_for_each(pos, &ulpq->lobby, tmp) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; @@ -686,19 +698,20 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq) /* Found it, so mark in the ssnmap. */ sctp_ssn_next(in, csid); - __skb_unlink(pos, pos->list); + __skb_unlink(pos, &ulpq->lobby); if (!event) { /* Create a temporary list to collect chunks on. */ event = sctp_skb2event(pos); - skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); } else { /* Attach all gathered skbs to the event. */ - __skb_queue_tail(sctp_event2skb(event)->list, pos); + __skb_queue_tail(&temp, pos); } } - /* Send event to the ULP. */ + /* Send event to the ULP. 'event' is the sctp_ulpevent for + * very first SKB on the 'temp' list. + */ if (event) sctp_ulpq_tail_event(ulpq, event); } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 4bd95c8..46252d2 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -286,16 +286,16 @@ void unix_gc(void) skb = skb_peek(&s->sk_receive_queue); while (skb && skb != (struct sk_buff *)&s->sk_receive_queue) { - nextsk=skb->next; + nextsk = skb->next; /* * Do we have file descriptors ? */ - if(UNIXCB(skb).fp) - { - __skb_unlink(skb, skb->list); - __skb_queue_tail(&hitlist,skb); + if (UNIXCB(skb).fp) { + __skb_unlink(skb, + &s->sk_receive_queue); + __skb_queue_tail(&hitlist, skb); } - skb=nextsk; + skb = nextsk; } spin_unlock(&s->sk_receive_queue.lock); } diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 7fd872ad..e20cfad 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -80,7 +80,7 @@ void x25_requeue_frames(struct sock *sk) if (!skb_prev) skb_queue_head(&sk->sk_write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } -- cgit v0.10.2 From abc3bc58047efa72ee9c2e208cbeb73d261ad703 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 19:25:56 -0700 Subject: [NET]: Kill skb->tc_classid Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 76c6885..f10a8b9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -196,7 +196,6 @@ struct skb_shared_info { * @private: Data which is private to the HIPPI implementation * @tc_index: Traffic control index * @tc_verd: traffic control verdict - * @tc_classid: traffic control classid */ struct sk_buff { @@ -275,9 +274,7 @@ struct sk_buff { __u32 tc_index; /* traffic control index */ #ifdef CONFIG_NET_CLS_ACT __u32 tc_verd; /* traffic control verdict */ - __u32 tc_classid; /* traffic control classid */ #endif - #endif diff --git a/include/net/act_api.h b/include/net/act_api.h index ed00a99..b55eb7c 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -63,7 +63,7 @@ struct tc_action_ops __u32 type; /* TBD to match kind */ __u32 capab; /* capabilities includes 4 bit version */ struct module *owner; - int (*act)(struct sk_buff **, struct tc_action *); + int (*act)(struct sk_buff **, struct tc_action *, struct tcf_result *); int (*get_stats)(struct sk_buff *, struct tc_action *); int (*dump)(struct sk_buff *, struct tc_action *,int , int); int (*cleanup)(struct tc_action *, int bind); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e6564b0..8896e6f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -300,7 +300,6 @@ void __kfree_skb(struct sk_buff *skb) skb->tc_index = 0; #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = 0; - skb->tc_classid = 0; #endif #endif @@ -376,7 +375,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd); n->tc_verd = CLR_TC_MUNGED(n->tc_verd); C(input_dev); - C(tc_classid); #endif #endif diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 249c619..c896a01 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -165,7 +165,7 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, while ((a = act) != NULL) { repeat: if (a->ops && a->ops->act) { - ret = a->ops->act(&skb, a); + ret = a->ops->act(&skb, a, res); if (TC_MUNGED & skb->tc_verd) { /* copied already, allow trampling */ skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); @@ -179,11 +179,6 @@ repeat: act = a->next; } exec_done: - if (skb->tc_classid > 0) { - res->classid = skb->tc_classid; - res->class = 0; - skb->tc_classid = 0; - } return ret; } diff --git a/net/sched/gact.c b/net/sched/gact.c index a811c89..d1c6d54 100644 --- a/net/sched/gact.c +++ b/net/sched/gact.c @@ -135,7 +135,7 @@ tcf_gact_cleanup(struct tc_action *a, int bind) } static int -tcf_gact(struct sk_buff **pskb, struct tc_action *a) +tcf_gact(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct tcf_gact *p = PRIV(a, gact); struct sk_buff *skb = *pskb; diff --git a/net/sched/ipt.c b/net/sched/ipt.c index b114d99..f50136e 100644 --- a/net/sched/ipt.c +++ b/net/sched/ipt.c @@ -201,7 +201,7 @@ tcf_ipt_cleanup(struct tc_action *a, int bind) } static int -tcf_ipt(struct sk_buff **pskb, struct tc_action *a) +tcf_ipt(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { int ret = 0, result = 0; struct tcf_ipt *p = PRIV(a, ipt); diff --git a/net/sched/mirred.c b/net/sched/mirred.c index f309ce3..20d0691 100644 --- a/net/sched/mirred.c +++ b/net/sched/mirred.c @@ -158,7 +158,7 @@ tcf_mirred_cleanup(struct tc_action *a, int bind) } static int -tcf_mirred(struct sk_buff **pskb, struct tc_action *a) +tcf_mirred(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct tcf_mirred *p = PRIV(a, mirred); struct net_device *dev; diff --git a/net/sched/pedit.c b/net/sched/pedit.c index 678be6a..767d24f 100644 --- a/net/sched/pedit.c +++ b/net/sched/pedit.c @@ -130,7 +130,7 @@ tcf_pedit_cleanup(struct tc_action *a, int bind) } static int -tcf_pedit(struct sk_buff **pskb, struct tc_action *a) +tcf_pedit(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = PRIV(a, pedit); struct sk_buff *skb = *pskb; diff --git a/net/sched/police.c b/net/sched/police.c index c03545f..eb39fb2 100644 --- a/net/sched/police.c +++ b/net/sched/police.c @@ -284,7 +284,8 @@ static int tcf_act_police_cleanup(struct tc_action *a, int bind) return 0; } -static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a) +static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a, + struct tcf_result *res) { psched_time_t now; struct sk_buff *skb = *pskb; diff --git a/net/sched/simple.c b/net/sched/simple.c index 3ab4c67..8a6ae4f 100644 --- a/net/sched/simple.c +++ b/net/sched/simple.c @@ -44,7 +44,7 @@ static DEFINE_RWLOCK(simp_lock); #include #include -static int tcf_simp(struct sk_buff **pskb, struct tc_action *a) +static int tcf_simp(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct sk_buff *skb = *pskb; struct tcf_defact *p = PRIV(a, defact); -- cgit v0.10.2 From ac3247baf8ecadf168642e3898b0212c29c79715 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:28:03 -0700 Subject: [NETFILTER]: connection tracking event notifiers This adds a notifier chain based event mechanism for ip_conntrack state changes. As opposed to the previous implementations in patch-o-matic, we do no longer need a field in the skb to achieve this. Thanks to the valuable input from Patrick McHardy and Rusty on the idea of a per_cpu implementation. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index 4ed720f..ae1270c 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -65,6 +65,63 @@ enum ip_conntrack_status { /* Both together */ IPS_NAT_DONE_MASK = (IPS_DST_NAT_DONE | IPS_SRC_NAT_DONE), + + /* Connection is dying (removed from lists), can not be unset. */ + IPS_DYING_BIT = 9, + IPS_DYING = (1 << IPS_DYING_BIT), +}; + +/* Connection tracking event bits */ +enum ip_conntrack_events +{ + /* New conntrack */ + IPCT_NEW_BIT = 0, + IPCT_NEW = (1 << IPCT_NEW_BIT), + + /* Expected connection */ + IPCT_RELATED_BIT = 1, + IPCT_RELATED = (1 << IPCT_RELATED_BIT), + + /* Destroyed conntrack */ + IPCT_DESTROY_BIT = 2, + IPCT_DESTROY = (1 << IPCT_DESTROY_BIT), + + /* Timer has been refreshed */ + IPCT_REFRESH_BIT = 3, + IPCT_REFRESH = (1 << IPCT_REFRESH_BIT), + + /* Status has changed */ + IPCT_STATUS_BIT = 4, + IPCT_STATUS = (1 << IPCT_STATUS_BIT), + + /* Update of protocol info */ + IPCT_PROTOINFO_BIT = 5, + IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT), + + /* Volatile protocol info */ + IPCT_PROTOINFO_VOLATILE_BIT = 6, + IPCT_PROTOINFO_VOLATILE = (1 << IPCT_PROTOINFO_VOLATILE_BIT), + + /* New helper for conntrack */ + IPCT_HELPER_BIT = 7, + IPCT_HELPER = (1 << IPCT_HELPER_BIT), + + /* Update of helper info */ + IPCT_HELPINFO_BIT = 8, + IPCT_HELPINFO = (1 << IPCT_HELPINFO_BIT), + + /* Volatile helper info */ + IPCT_HELPINFO_VOLATILE_BIT = 9, + IPCT_HELPINFO_VOLATILE = (1 << IPCT_HELPINFO_VOLATILE_BIT), + + /* NAT info */ + IPCT_NATINFO_BIT = 10, + IPCT_NATINFO = (1 << IPCT_NATINFO_BIT), +}; + +enum ip_conntrack_expect_events { + IPEXP_NEW_BIT = 0, + IPEXP_NEW = (1 << IPEXP_NEW_BIT), }; #ifdef __KERNEL__ @@ -280,6 +337,11 @@ static inline int is_confirmed(struct ip_conntrack *ct) return test_bit(IPS_CONFIRMED_BIT, &ct->status); } +static inline int is_dying(struct ip_conntrack *ct) +{ + return test_bit(IPS_DYING_BIT, &ct->status); +} + extern unsigned int ip_conntrack_htable_size; struct ip_conntrack_stat @@ -303,6 +365,88 @@ struct ip_conntrack_stat #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++) +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +#include + +struct ip_conntrack_ecache { + struct ip_conntrack *ct; + unsigned int events; +}; +DECLARE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); + +#define CONNTRACK_ECACHE(x) (__get_cpu_var(ip_conntrack_ecache).x) + +extern struct notifier_block *ip_conntrack_chain; +extern struct notifier_block *ip_conntrack_expect_chain; + +static inline int ip_conntrack_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&ip_conntrack_chain, nb); +} + +static inline int ip_conntrack_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&ip_conntrack_chain, nb); +} + +static inline int +ip_conntrack_expect_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&ip_conntrack_expect_chain, nb); +} + +static inline int +ip_conntrack_expect_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&ip_conntrack_expect_chain, nb); +} + +static inline void +ip_conntrack_event_cache(enum ip_conntrack_events event, + const struct sk_buff *skb) +{ + struct ip_conntrack_ecache *ecache = + &__get_cpu_var(ip_conntrack_ecache); + + if (unlikely((struct ip_conntrack *) skb->nfct != ecache->ct)) { + if (net_ratelimit()) { + printk(KERN_ERR "ctevent: skb->ct != ecache->ct !!!\n"); + dump_stack(); + } + } + ecache->events |= event; +} + +extern void +ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct); +extern void ip_conntrack_event_cache_init(const struct sk_buff *skb); + +static inline void ip_conntrack_event(enum ip_conntrack_events event, + struct ip_conntrack *ct) +{ + if (is_confirmed(ct) && !is_dying(ct)) + notifier_call_chain(&ip_conntrack_chain, event, ct); +} + +static inline void +ip_conntrack_expect_event(enum ip_conntrack_expect_events event, + struct ip_conntrack_expect *exp) +{ + notifier_call_chain(&ip_conntrack_expect_chain, event, exp); +} +#else /* CONFIG_IP_NF_CONNTRACK_EVENTS */ +static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, + const struct sk_buff *skb) {} +static inline void ip_conntrack_event(enum ip_conntrack_events event, + struct ip_conntrack *ct) {} +static inline void ip_conntrack_deliver_cached_events_for( + struct ip_conntrack *ct) {} +static inline void ip_conntrack_event_cache_init(const struct sk_buff *skb) {} +static inline void +ip_conntrack_expect_event(enum ip_conntrack_expect_events event, + struct ip_conntrack_expect *exp) {} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + #ifdef CONFIG_IP_NF_NAT_NEEDED static inline int ip_nat_initialized(struct ip_conntrack *conntrack, enum ip_nat_manip_type manip) diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h index 694aec9..46eeea1 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_core.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h @@ -38,12 +38,21 @@ extern int __ip_conntrack_confirm(struct sk_buff **pskb); /* Confirm a connection: returns NF_DROP if packet must be dropped. */ static inline int ip_conntrack_confirm(struct sk_buff **pskb) { - if ((*pskb)->nfct - && !is_confirmed((struct ip_conntrack *)(*pskb)->nfct)) - return __ip_conntrack_confirm(pskb); - return NF_ACCEPT; + struct ip_conntrack *ct = (struct ip_conntrack *)(*pskb)->nfct; + int ret = NF_ACCEPT; + + if (ct && !is_confirmed(ct)) + ret = __ip_conntrack_confirm(pskb); + ip_conntrack_deliver_cached_events_for(ct); + + return ret; } +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +struct ip_conntrack_ecache; +extern void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ec); +#endif + extern struct list_head *ip_conntrack_hash; extern struct list_head ip_conntrack_expect_list; extern rwlock_t ip_conntrack_lock; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 46d4cb1..ff3393e 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -40,6 +40,16 @@ config IP_NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config IP_NF_CONNTRACK_EVENTS + bool "Connection tracking events" + depends on IP_NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + provide a notifier chain that can be used by other kernel code + to get notified about changes in the connection tracking state. + + IF unsure, say `N'. + config IP_NF_CT_PROTO_SCTP tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' depends on IP_NF_CONNTRACK && EXPERIMENTAL diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 04c3414..caf89de 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -37,6 +37,7 @@ #include #include #include +#include /* ip_conntrack_lock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ @@ -49,7 +50,7 @@ #include #include -#define IP_CONNTRACK_VERSION "2.1" +#define IP_CONNTRACK_VERSION "2.2" #if 0 #define DEBUGP printk @@ -76,6 +77,81 @@ unsigned int ip_ct_log_invalid; static LIST_HEAD(unconfirmed); static int ip_conntrack_vmalloc; +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +struct notifier_block *ip_conntrack_chain; +struct notifier_block *ip_conntrack_expect_chain; + +DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); + +static inline void __deliver_cached_events(struct ip_conntrack_ecache *ecache) +{ + if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) + notifier_call_chain(&ip_conntrack_chain, ecache->events, + ecache->ct); + ecache->events = 0; +} + +void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) +{ + __deliver_cached_events(ecache); +} + +/* Deliver all cached events for a particular conntrack. This is called + * by code prior to async packet handling or freeing the skb */ +void +ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct) +{ + struct ip_conntrack_ecache *ecache = + &__get_cpu_var(ip_conntrack_ecache); + + if (!ct) + return; + + if (ecache->ct == ct) { + DEBUGP("ecache: delivering event for %p\n", ct); + __deliver_cached_events(ecache); + } else { + if (net_ratelimit()) + printk(KERN_WARNING "ecache: want to deliver for %p, " + "but cache has %p\n", ct, ecache->ct); + } + + /* signalize that events have already been delivered */ + ecache->ct = NULL; +} + +/* Deliver cached events for old pending events, if current conntrack != old */ +void ip_conntrack_event_cache_init(const struct sk_buff *skb) +{ + struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct; + struct ip_conntrack_ecache *ecache = + &__get_cpu_var(ip_conntrack_ecache); + + /* take care of delivering potentially old events */ + if (ecache->ct != ct) { + enum ip_conntrack_info ctinfo; + /* we have to check, since at startup the cache is NULL */ + if (likely(ecache->ct)) { + DEBUGP("ecache: entered for different conntrack: " + "ecache->ct=%p, skb->nfct=%p. delivering " + "events\n", ecache->ct, ct); + __deliver_cached_events(ecache); + ip_conntrack_put(ecache->ct); + } else { + DEBUGP("ecache: entered for conntrack %p, " + "cache was clean before\n", ct); + } + + /* initialize for this conntrack/packet */ + ecache->ct = ip_conntrack_get(skb, &ctinfo); + /* ecache->events cleared by __deliver_cached_devents() */ + } else { + DEBUGP("ecache: re-entered for conntrack %p.\n", ct); + } +} + +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); void @@ -223,6 +299,8 @@ destroy_conntrack(struct nf_conntrack *nfct) IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); + set_bit(IPS_DYING_BIT, &ct->status); + /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to ip_conntrack_lock!!! -HW */ @@ -261,6 +339,7 @@ static void death_by_timeout(unsigned long ul_conntrack) { struct ip_conntrack *ct = (void *)ul_conntrack; + ip_conntrack_event(IPCT_DESTROY, ct); write_lock_bh(&ip_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. */ @@ -374,6 +453,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb) set_bit(IPS_CONFIRMED_BIT, &ct->status); CONNTRACK_STAT_INC(insert); write_unlock_bh(&ip_conntrack_lock); + if (ct->helper) + ip_conntrack_event_cache(IPCT_HELPER, *pskb); +#ifdef CONFIG_IP_NF_NAT_NEEDED + if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || + test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_NATINFO, *pskb); +#endif + ip_conntrack_event_cache(master_ct(ct) ? + IPCT_RELATED : IPCT_NEW, *pskb); + return NF_ACCEPT; } @@ -607,7 +696,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; struct ip_conntrack_protocol *proto; - int set_reply; + int set_reply = 0; int ret; /* Previously seen (loopback or untracked)? Ignore. */ @@ -666,6 +755,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum, IP_NF_ASSERT((*pskb)->nfct); + ip_conntrack_event_cache_init(*pskb); + ret = proto->packet(ct, *pskb, ctinfo); if (ret < 0) { /* Invalid: inverse of the return code tells @@ -676,8 +767,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum, return -ret; } - if (set_reply) - set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_STATUS, *pskb); return ret; } @@ -824,6 +915,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) evict_oldest_expect(expect->master); ip_conntrack_expect_insert(expect); + ip_conntrack_expect_event(IPEXP_NEW, expect); ret = 0; out: write_unlock_bh(&ip_conntrack_lock); @@ -861,8 +953,10 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me) static inline int unhelp(struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_helper *me) { - if (tuplehash_to_ctrack(i)->helper == me) + if (tuplehash_to_ctrack(i)->helper == me) { + ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i)); tuplehash_to_ctrack(i)->helper = NULL; + } return 0; } @@ -924,6 +1018,7 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); + ip_conntrack_event_cache(IPCT_REFRESH, skb); } ct_add_counters(ct, ctinfo, skb); write_unlock_bh(&ip_conntrack_lock); @@ -1012,6 +1107,23 @@ ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) ip_conntrack_put(ct); } + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS + { + /* we need to deliver all cached events in order to drop + * the reference counts */ + int cpu; + for_each_cpu(cpu) { + struct ip_conntrack_ecache *ecache = + &per_cpu(ip_conntrack_ecache, cpu); + if (ecache->ct) { + __ip_ct_deliver_cached_events(ecache); + ip_conntrack_put(ecache->ct); + ecache->ct = NULL; + } + } + } +#endif } /* Fast function for those who don't want to parse /proc (and I don't diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 7a3b773..9658896 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -262,7 +262,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) } /* We don't update if it's older than what we have. */ -static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) +static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, + struct sk_buff *skb) { unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; @@ -276,10 +277,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) oldest = i; } - if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; - else if (oldest != NUM_SEQ_TO_REMEMBER) + ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } else if (oldest != NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][oldest] = nl_seq; + ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } } static int help(struct sk_buff **pskb, @@ -439,7 +443,7 @@ out_update_nl: /* Now if this ends in \n, update ftp info. Seq may have been * adjusted by NAT code. */ if (ends_in_nl) - update_nl_seq(seq, ct_ftp_info,dir); + update_nl_seq(seq, ct_ftp_info,dir, *pskb); out: spin_unlock_bh(&ip_ftp_lock); return ret; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 602c74db..dca1f63 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -102,6 +102,7 @@ static int icmp_packet(struct ip_conntrack *ct, ct->timeout.function((unsigned long)ct); } else { atomic_inc(&ct->proto.icmp.count); + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 31d7539..3d5f878 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack, } conntrack->proto.sctp.state = newconntrack; + if (oldsctpstate != newconntrack) + ip_conntrack_event_cache(IPCT_PROTOINFO, skb); write_unlock_bh(&sctp_lock); } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 809dfed..a569ad1 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -973,6 +973,10 @@ static int tcp_packet(struct ip_conntrack *conntrack, ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; write_unlock_bh(&tcp_lock); + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + if (new_state != old_state) + ip_conntrack_event_cache(IPCT_PROTOINFO, skb); + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 8c1eaba..6066eaf 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack, ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ - set_bit(IPS_ASSURED_BIT, &conntrack->status); + if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) + ip_conntrack_event_cache(IPCT_STATUS, skb); } else ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index dccd4ab..f088000 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -402,6 +402,7 @@ static unsigned int ip_confirm(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { + ip_conntrack_event_cache_init(*pskb); /* We've seen it coming out the other side: confirm it */ return ip_conntrack_confirm(pskb); } @@ -419,6 +420,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum, ct = ip_conntrack_get(*pskb, &ctinfo); if (ct && ct->helper) { unsigned int ret; + ip_conntrack_event_cache_init(*pskb); ret = ct->helper->help(pskb, ct, ctinfo); if (ret != NF_ACCEPT) return ret; @@ -889,6 +891,7 @@ static int init_or_cleanup(int init) return ret; cleanup: + synchronize_net(); #ifdef CONFIG_SYSCTL unregister_sysctl_table(ip_ct_sysctl_header); cleanup_localinops: @@ -971,6 +974,13 @@ void need_ip_conntrack(void) { } +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +EXPORT_SYMBOL_GPL(ip_conntrack_chain); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain); +EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier); +EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier); +EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); +#endif EXPORT_SYMBOL(ip_conntrack_protocol_register); EXPORT_SYMBOL(ip_conntrack_protocol_unregister); EXPORT_SYMBOL(ip_ct_get_tuple); -- cgit v0.10.2 From f9e815b376dc19e6afc551cd755ac64e9e42d81f Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:30:24 -0700 Subject: [NETFITLER]: Add nfnetlink layer. Introduce "nfnetlink" (netfilter netlink) layer. This layer is used as transport layer for all userspace communication of the new upcoming netfilter subsystems, such as ctnetlink, nfnetlink_queue and some day even the mythical pkttables ;) Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h new file mode 100644 index 0000000..8f1bfb8 --- /dev/null +++ b/include/linux/netfilter/nfnetlink.h @@ -0,0 +1,145 @@ +#ifndef _NFNETLINK_H +#define _NFNETLINK_H +#include + +/* nfnetlink groups: Up to 32 maximum */ +#define NF_NETLINK_CONNTRACK_NEW 0x00000001 +#define NF_NETLINK_CONNTRACK_UPDATE 0x00000002 +#define NF_NETLINK_CONNTRACK_DESTROY 0x00000004 +#define NF_NETLINK_CONNTRACK_EXP_NEW 0x00000008 +#define NF_NETLINK_CONNTRACK_EXP_UPDATE 0x00000010 +#define NF_NETLINK_CONNTRACK_EXP_DESTROY 0x00000020 + +/* Generic structure for encapsulation optional netfilter information. + * It is reminiscent of sockaddr, but with sa_family replaced + * with attribute type. + * ! This should someday be put somewhere generic as now rtnetlink and + * ! nfnetlink use the same attributes methods. - J. Schulist. + */ + +struct nfattr +{ + u_int16_t nfa_len; + u_int16_t nfa_type; +} __attribute__ ((packed)); + +/* FIXME: Shamelessly copy and pasted from rtnetlink.h, it's time + * to put this in a generic file */ + +#define NFA_ALIGNTO 4 +#define NFA_ALIGN(len) (((len) + NFA_ALIGNTO - 1) & ~(NFA_ALIGNTO - 1)) +#define NFA_OK(nfa,len) ((len) > 0 && (nfa)->nfa_len >= sizeof(struct nfattr) \ + && (nfa)->nfa_len <= (len)) +#define NFA_NEXT(nfa,attrlen) ((attrlen) -= NFA_ALIGN((nfa)->nfa_len), \ + (struct nfattr *)(((char *)(nfa)) + NFA_ALIGN((nfa)->nfa_len))) +#define NFA_LENGTH(len) (NFA_ALIGN(sizeof(struct nfattr)) + (len)) +#define NFA_SPACE(len) NFA_ALIGN(NFA_LENGTH(len)) +#define NFA_DATA(nfa) ((void *)(((char *)(nfa)) + NFA_LENGTH(0))) +#define NFA_PAYLOAD(nfa) ((int)((nfa)->nfa_len) - NFA_LENGTH(0)) +#define NFA_NEST(skb, type) \ +({ struct nfattr *__start = (struct nfattr *) (skb)->tail; \ + NFA_PUT(skb, type, 0, NULL); \ + __start; }) +#define NFA_NEST_END(skb, start) \ +({ (start)->nfa_len = ((skb)->tail - (unsigned char *) (start)); \ + (skb)->len; }) +#define NFA_NEST_CANCEL(skb, start) \ +({ if (start) \ + skb_trim(skb, (unsigned char *) (start) - (skb)->data); \ + -1; }) + +/* General form of address family dependent message. + */ +struct nfgenmsg { + u_int8_t nfgen_family; /* AF_xxx */ + u_int8_t version; /* nfnetlink version */ + u_int16_t res_id; /* resource id */ +} __attribute__ ((packed)); + +#define NFNETLINK_V1 1 + +#define NFM_NFA(n) ((struct nfattr *)(((char *)(n)) \ + + NLMSG_ALIGN(sizeof(struct nfgenmsg)))) +#define NFM_PAYLOAD(n) NLMSG_PAYLOAD(n, sizeof(struct nfgenmsg)) + +/* netfilter netlink message types are split in two pieces: + * 8 bit subsystem, 8bit operation. + */ + +#define NFNL_SUBSYS_ID(x) ((x & 0xff00) >> 8) +#define NFNL_MSG_TYPE(x) (x & 0x00ff) + +enum nfnl_subsys_id { + NFNL_SUBSYS_NONE = 0, + NFNL_SUBSYS_CTNETLINK, + NFNL_SUBSYS_CTNETLINK_EXP, + NFNL_SUBSYS_IPTNETLINK, + NFNL_SUBSYS_QUEUE, + NFNL_SUBSYS_ULOG, + NFNL_SUBSYS_COUNT, +}; + +#ifdef __KERNEL__ + +#include + +struct nfnl_callback +{ + kernel_cap_t cap_required; /* capabilities required for this msg */ + int (*call)(struct sock *nl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp); +}; + +struct nfnetlink_subsystem +{ + const char *name; + __u8 subsys_id; /* nfnetlink subsystem ID */ + __u8 cb_count; /* number of callbacks */ + u_int32_t attr_count; /* number of nfattr's */ + struct nfnl_callback *cb; /* callback for individual types */ +}; + +extern void __nfa_fill(struct sk_buff *skb, int attrtype, + int attrlen, const void *data); +#define NFA_PUT(skb, attrtype, attrlen, data) \ +({ if (skb_tailroom(skb) < (int)NFA_SPACE(attrlen)) goto nfattr_failure; \ + __nfa_fill(skb, attrtype, attrlen, data); }) + +extern struct semaphore nfnl_sem; + +#define nfnl_shlock() down(&nfnl_sem) +#define nfnl_shlock_nowait() down_trylock(&nfnl_sem) + +#define nfnl_shunlock() do { up(&nfnl_sem); \ + if(nfnl && nfnl->sk_receive_queue.qlen) \ + nfnl->sk_data_ready(nfnl, 0); \ + } while(0) + +extern void nfnl_lock(void); +extern void nfnl_unlock(void); + +extern int nfnetlink_subsys_register(struct nfnetlink_subsystem *n); +extern int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n); + +extern int nfattr_parse(struct nfattr *tb[], int maxattr, + struct nfattr *nfa, int len); + +#define nfattr_parse_nested(tb, max, nfa) \ + nfattr_parse((tb), (max), NFA_DATA((nfa)), NFA_PAYLOAD((nfa))) + +#define nfattr_bad_size(tb, max, cta_min) \ +({ int __i, __res = 0; \ + for (__i=0; __i, + * (C) 2002-2005 by Harald Welte + * (C) 2005 by Pablo Neira Ayuso + * + * Initial netfilter messages via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_LICENSE("GPL"); + +static char __initdata nfversion[] = "0.30"; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static struct sock *nfnl = NULL; +static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; +DECLARE_MUTEX(nfnl_sem); + +void nfnl_lock(void) +{ + nfnl_shlock(); +} + +void nfnl_unlock(void) +{ + nfnl_shunlock(); +} + +int nfnetlink_subsys_register(struct nfnetlink_subsystem *n) +{ + DEBUGP("registering subsystem ID %u\n", n->subsys_id); + + /* If the netlink socket wasn't created, then fail */ + if (!nfnl) + return -1; + + nfnl_lock(); + subsys_table[n->subsys_id] = n; + nfnl_unlock(); + + return 0; +} + +int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n) +{ + DEBUGP("unregistering subsystem ID %u\n", n->subsys_id); + + nfnl_lock(); + subsys_table[n->subsys_id] = NULL; + nfnl_unlock(); + + return 0; +} + +static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) +{ + u_int8_t subsys_id = NFNL_SUBSYS_ID(type); + + if (subsys_id >= NFNL_SUBSYS_COUNT + || subsys_table[subsys_id] == NULL) + return NULL; + + return subsys_table[subsys_id]; +} + +static inline struct nfnl_callback * +nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss) +{ + u_int8_t cb_id = NFNL_MSG_TYPE(type); + + if (cb_id >= ss->cb_count) { + DEBUGP("msgtype %u >= %u, returning\n", type, ss->cb_count); + return NULL; + } + + return &ss->cb[cb_id]; +} + +void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, + const void *data) +{ + struct nfattr *nfa; + int size = NFA_LENGTH(attrlen); + + nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); + nfa->nfa_type = attrtype; + nfa->nfa_len = size; + memcpy(NFA_DATA(nfa), data, attrlen); +} + +int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) +{ + memset(tb, 0, sizeof(struct nfattr *) * maxattr); + + while (NFA_OK(nfa, len)) { + unsigned flavor = nfa->nfa_type; + if (flavor && flavor <= maxattr) + tb[flavor-1] = nfa; + nfa = NFA_NEXT(nfa, len); + } + + return 0; +} + +/** + * nfnetlink_check_attributes - check and parse nfnetlink attributes + * + * subsys: nfnl subsystem for which this message is to be parsed + * nlmsghdr: netlink message to be checked/parsed + * cda: array of pointers, needs to be at least subsys->attr_count big + * + */ +static int +nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, + struct nlmsghdr *nlh, struct nfattr *cda[]) +{ + int min_len; + + memset(cda, 0, sizeof(struct nfattr *) * subsys->attr_count); + + /* check attribute lengths. */ + min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg)); + if (nlh->nlmsg_len < min_len) + return -EINVAL; + + if (nlh->nlmsg_len > min_len) { + struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh)); + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + + while (NFA_OK(attr, attrlen)) { + unsigned flavor = attr->nfa_type; + if (flavor) { + if (flavor > subsys->attr_count) + return -EINVAL; + cda[flavor - 1] = attr; + } + attr = NFA_NEXT(attr, attrlen); + } + } else + return -EINVAL; + + return 0; +} + +int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +{ + int allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL; + int err = 0; + + NETLINK_CB(skb).dst_groups = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(nfnl, skb, pid, group, allocation); + if (echo) + err = netlink_unicast(nfnl, skb, pid, MSG_DONTWAIT); + + return err; +} + +int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) +{ + return netlink_unicast(nfnl, skb, pid, flags); +} + +/* Process one complete nfnetlink message. */ +static inline int nfnetlink_rcv_msg(struct sk_buff *skb, + struct nlmsghdr *nlh, int *errp) +{ + struct nfnl_callback *nc; + struct nfnetlink_subsystem *ss; + int type, err = 0; + + DEBUGP("entered; subsys=%u, msgtype=%u\n", + NFNL_SUBSYS_ID(nlh->nlmsg_type), + NFNL_MSG_TYPE(nlh->nlmsg_type)); + + /* Only requests are handled by kernel now. */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { + DEBUGP("received non-request message\n"); + return 0; + } + + /* All the messages must at least contain nfgenmsg */ + if (nlh->nlmsg_len < + NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) { + DEBUGP("received message was too short\n"); + return 0; + } + + type = nlh->nlmsg_type; + ss = nfnetlink_get_subsys(type); + if (!ss) + goto err_inval; + + nc = nfnetlink_find_client(type, ss); + if (!nc) { + DEBUGP("unable to find client for type %d\n", type); + goto err_inval; + } + + if (nc->cap_required && + !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) { + DEBUGP("permission denied for type %d\n", type); + *errp = -EPERM; + return -1; + } + + { + struct nfattr *cda[ss->attr_count]; + + memset(cda, 0, ss->attr_count*sizeof(struct nfattr *)); + + err = nfnetlink_check_attributes(ss, nlh, cda); + if (err < 0) + goto err_inval; + + err = nc->call(nfnl, skb, nlh, cda, errp); + *errp = err; + return err; + } + +err_inval: + *errp = -EINVAL; + return -1; +} + +/* Process one packet of messages. */ +static inline int nfnetlink_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr *nlh; + + while (skb->len >= NLMSG_SPACE(0)) { + u32 rlen; + + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(struct nlmsghdr) + || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if (nfnetlink_rcv_msg(skb, nlh, &err)) { + if (!err) + return -1; + netlink_ack(skb, nlh, err); + } else + if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + + return 0; +} + +static void nfnetlink_rcv(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + if (nfnl_shlock_nowait()) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (nfnetlink_rcv_skb(skb)) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, + skb); + else + kfree_skb(skb); + break; + } + kfree_skb(skb); + } + + up(&nfnl_sem); + } while(nfnl && nfnl->sk_receive_queue.qlen); +} + +void __exit nfnetlink_exit(void) +{ + printk("Removing netfilter NETLINK layer.\n"); + sock_release(nfnl->sk_socket); + return; +} + +int __init nfnetlink_init(void) +{ + printk("Netfilter messages via NETLINK v%s.\n", nfversion); + + nfnl = netlink_kernel_create(NETLINK_NETFILTER, nfnetlink_rcv); + if (!nfnl) { + printk(KERN_ERR "cannot initialize nfnetlink!\n"); + return -1; + } + + return 0; +} + +module_init(nfnetlink_init); +module_exit(nfnetlink_exit); + +EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); +EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); +EXPORT_SYMBOL_GPL(nfnetlink_send); +EXPORT_SYMBOL_GPL(nfnetlink_unicast); +EXPORT_SYMBOL_GPL(nfattr_parse); +EXPORT_SYMBOL_GPL(__nfa_fill); -- cgit v0.10.2 From b0573dea1fb32ebc72ffa05980fd840df1d80860 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 19:30:51 -0700 Subject: [NET]: Introduce SO_{SND,RCV}BUFFORCE socket options Allows overriding of sysctl_{wmem,rmrm}_max Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/asm-alpha/socket.h b/include/asm-alpha/socket.h index d00259d..b519322 100644 --- a/include/asm-alpha/socket.h +++ b/include/asm-alpha/socket.h @@ -25,6 +25,8 @@ #define SO_ERROR 0x1007 #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_RCVLOWAT 0x1010 #define SO_SNDLOWAT 0x1011 #define SO_RCVTIMEO 0x1012 diff --git a/include/asm-arm/socket.h b/include/asm-arm/socket.h index 46d2058..3c51da6 100644 --- a/include/asm-arm/socket.h +++ b/include/asm-arm/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-arm26/socket.h b/include/asm-arm26/socket.h index 46d2058..3c51da6 100644 --- a/include/asm-arm26/socket.h +++ b/include/asm-arm26/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-cris/socket.h b/include/asm-cris/socket.h index f159b4f..8b1da3e 100644 --- a/include/asm-cris/socket.h +++ b/include/asm-cris/socket.h @@ -16,6 +16,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-frv/socket.h b/include/asm-frv/socket.h index c3be17c..7177f8b 100644 --- a/include/asm-frv/socket.h +++ b/include/asm-frv/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-h8300/socket.h b/include/asm-h8300/socket.h index af33b85..d98cf85 100644 --- a/include/asm-h8300/socket.h +++ b/include/asm-h8300/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-i386/socket.h b/include/asm-i386/socket.h index 07f6b38..802ae76 100644 --- a/include/asm-i386/socket.h +++ b/include/asm-i386/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-ia64/socket.h b/include/asm-ia64/socket.h index 21a9f10..a255006 100644 --- a/include/asm-ia64/socket.h +++ b/include/asm-ia64/socket.h @@ -23,6 +23,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-m32r/socket.h b/include/asm-m32r/socket.h index 159519d..8b6680f 100644 --- a/include/asm-m32r/socket.h +++ b/include/asm-m32r/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-m68k/socket.h b/include/asm-m68k/socket.h index 8d0b9fc..f578ca4 100644 --- a/include/asm-m68k/socket.h +++ b/include/asm-m68k/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-mips/socket.h b/include/asm-mips/socket.h index 020b4db..d478a86 100644 --- a/include/asm-mips/socket.h +++ b/include/asm-mips/socket.h @@ -37,6 +37,8 @@ To add: #define SO_REUSEPORT 0x0200 /* Allow local address and port reuse. */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_SNDBUF 0x1001 /* Send buffer size. */ #define SO_RCVBUF 0x1002 /* Receive buffer. */ +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_SNDLOWAT 0x1003 /* send low-water mark */ #define SO_RCVLOWAT 0x1004 /* receive low-water mark */ #define SO_SNDTIMEO 0x1005 /* send timeout */ diff --git a/include/asm-parisc/socket.h b/include/asm-parisc/socket.h index 4a77996..1bf54dc 100644 --- a/include/asm-parisc/socket.h +++ b/include/asm-parisc/socket.h @@ -16,6 +16,8 @@ /* To add :#define SO_REUSEPORT 0x0200 */ #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_SNDLOWAT 0x1003 #define SO_RCVLOWAT 0x1004 #define SO_SNDTIMEO 0x1005 diff --git a/include/asm-ppc/socket.h b/include/asm-ppc/socket.h index 4134376..296e1a3 100644 --- a/include/asm-ppc/socket.h +++ b/include/asm-ppc/socket.h @@ -20,6 +20,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-ppc64/socket.h b/include/asm-ppc64/socket.h index 59e00df..9e1af8e 100644 --- a/include/asm-ppc64/socket.h +++ b/include/asm-ppc64/socket.h @@ -21,6 +21,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-s390/socket.h b/include/asm-s390/socket.h index 0e96eec..15a5298 100644 --- a/include/asm-s390/socket.h +++ b/include/asm-s390/socket.h @@ -22,6 +22,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-sh/socket.h b/include/asm-sh/socket.h index dde696c..553904f 100644 --- a/include/asm-sh/socket.h +++ b/include/asm-sh/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_RCVBUFFORCE 32 +#define SO_SNDBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-sparc/socket.h b/include/asm-sparc/socket.h index c1154e3..09575b6 100644 --- a/include/asm-sparc/socket.h +++ b/include/asm-sparc/socket.h @@ -29,6 +29,8 @@ #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_ERROR 0x1007 #define SO_TYPE 0x1008 diff --git a/include/asm-sparc64/socket.h b/include/asm-sparc64/socket.h index 865547a..59987da 100644 --- a/include/asm-sparc64/socket.h +++ b/include/asm-sparc64/socket.h @@ -29,6 +29,8 @@ #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_ERROR 0x1007 #define SO_TYPE 0x1008 diff --git a/include/asm-v850/socket.h b/include/asm-v850/socket.h index 213b852..0240d36 100644 --- a/include/asm-v850/socket.h +++ b/include/asm-v850/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h index d9a252e..f2cdbea 100644 --- a/include/asm-x86_64/socket.h +++ b/include/asm-x86_64/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-xtensa/socket.h b/include/asm-xtensa/socket.h index daccd05..00f83f3 100644 --- a/include/asm-xtensa/socket.h +++ b/include/asm-xtensa/socket.h @@ -24,6 +24,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/net/core/sock.c b/net/core/sock.c index 12f6d9a..51a5e7d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -260,7 +260,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if (val > sysctl_wmem_max) val = sysctl_wmem_max; - +set_sndbuf: sk->sk_userlocks |= SOCK_SNDBUF_LOCK; if ((val * 2) < SOCK_MIN_SNDBUF) sk->sk_sndbuf = SOCK_MIN_SNDBUF; @@ -274,6 +274,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sk->sk_write_space(sk); break; + case SO_SNDBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_sndbuf; + case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think about it this is right. Otherwise apps have to @@ -282,7 +289,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if (val > sysctl_rmem_max) val = sysctl_rmem_max; - +set_rcvbuf: sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* FIXME: is this lower bound the right one? */ if ((val * 2) < SOCK_MIN_RCVBUF) @@ -291,6 +298,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sk->sk_rcvbuf = val * 2; break; + case SO_RCVBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_rcvbuf; + case SO_KEEPALIVE: #ifdef CONFIG_INET if (sk->sk_protocol == IPPROTO_TCP) -- cgit v0.10.2 From 6f1cf16582160c4839f05007c978743911aa022b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 9 Aug 2005 19:31:17 -0700 Subject: [NET]: Remove HIPPI private from skbuff.h This removes the private element from skbuff, that is only used by HIPPI. Instead it uses skb->cb[] to hold the additional data that is needed in the output path from hard_header to device driver. PS: The only qdisc that might potentially corrupt this cb[] is if netem was used over HIPPI. I will take care of that by fixing netem to use skb->stamp. I don't expect many users of netem over HIPPI Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller diff --git a/drivers/net/rrunner.c b/drivers/net/rrunner.c index 12a86f9..ec1a18d 100644 --- a/drivers/net/rrunner.c +++ b/drivers/net/rrunner.c @@ -1429,6 +1429,7 @@ static int rr_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct rr_private *rrpriv = netdev_priv(dev); struct rr_regs __iomem *regs = rrpriv->regs; + struct hippi_cb *hcb = (struct hippi_cb *) skb->cb; struct ring_ctrl *txctrl; unsigned long flags; u32 index, len = skb->len; @@ -1460,7 +1461,7 @@ static int rr_start_xmit(struct sk_buff *skb, struct net_device *dev) ifield = (u32 *)skb_push(skb, 8); ifield[0] = 0; - ifield[1] = skb->private.ifield; + ifield[1] = hcb->ifield; /* * We don't need the lock before we are actually going to start diff --git a/include/linux/hippidevice.h b/include/linux/hippidevice.h index 9debe6b..9bc3b68 100644 --- a/include/linux/hippidevice.h +++ b/include/linux/hippidevice.h @@ -26,6 +26,11 @@ #include #ifdef __KERNEL__ + +struct hippi_cb { + __u32 ifield; +}; + extern unsigned short hippi_type_trans(struct sk_buff *skb, struct net_device *dev); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f10a8b9..4aeadb1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -193,7 +193,6 @@ struct skb_shared_info { * @nfct: Associated connection, if any * @nfctinfo: Relationship of this skb to the connection * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c - * @private: Data which is private to the HIPPI implementation * @tc_index: Traffic control index * @tc_verd: traffic control verdict */ @@ -265,11 +264,6 @@ struct sk_buff { struct nf_bridge_info *nf_bridge; #endif #endif /* CONFIG_NETFILTER */ -#if defined(CONFIG_HIPPI) - union { - __u32 ifield; - } private; -#endif #ifdef CONFIG_NET_SCHED __u32 tc_index; /* traffic control index */ #ifdef CONFIG_NET_CLS_ACT diff --git a/net/802/hippi.c b/net/802/hippi.c index 051e8af..cb45ae1 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -51,6 +51,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev, unsigned len) { struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN); + struct hippi_cb *hcb = (struct hippi_cb *) skb->cb; if (!len){ len = skb->len - HIPPI_HLEN; @@ -84,9 +85,10 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev, if (daddr) { memcpy(hip->le.dest_switch_addr, daddr + 3, 3); - memcpy(&skb->private.ifield, daddr + 2, 4); + memcpy(&hcb->ifield, daddr + 2, 4); return HIPPI_HLEN; } + hcb->ifield = 0; return -((int)HIPPI_HLEN); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8896e6f..16df7bd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -365,9 +365,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) nf_bridge_get(skb->nf_bridge); #endif #endif /*CONFIG_NETFILTER*/ -#if defined(CONFIG_HIPPI) - C(private); -#endif #ifdef CONFIG_NET_SCHED C(tc_index); #ifdef CONFIG_NET_CLS_ACT -- cgit v0.10.2 From 080774a243f56ce2195ace96fba3d18548ee48ce Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:32:58 -0700 Subject: [NETFILTER]: Add ctnetlink subsystem Add ctnetlink subsystem for userspace-access to ip_conntrack table. This allows reading and updating of existing entries, as well as creating new ones (and new expect's) via nfnetlink. Please note the 'strange' byte order: nfattr (tag+length) are in host byte order, while the payload is always guaranteed to be in network byte order. This allows a simple userspace process to encapsulate netlink messages into arch-independent udp packets by just processing/swapping the headers and not knowing anything about the actual payload. Signed-off-by: Harald Welte Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 8f1bfb8..ace7a7b 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -56,7 +56,7 @@ struct nfgenmsg { u_int16_t res_id; /* resource id */ } __attribute__ ((packed)); -#define NFNETLINK_V1 1 +#define NFNETLINK_V0 0 #define NFM_NFA(n) ((struct nfattr *)(((char *)(n)) \ + NLMSG_ALIGN(sizeof(struct nfgenmsg)))) @@ -81,6 +81,7 @@ enum nfnl_subsys_id { #ifdef __KERNEL__ +#include #include struct nfnl_callback diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h new file mode 100644 index 0000000..fb528e0 --- /dev/null +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -0,0 +1,123 @@ +#ifndef _IPCONNTRACK_NETLINK_H +#define _IPCONNTRACK_NETLINK_H +#include + +enum cntl_msg_types { + IPCTNL_MSG_CT_NEW, + IPCTNL_MSG_CT_GET, + IPCTNL_MSG_CT_DELETE, + IPCTNL_MSG_CT_GET_CTRZERO, + + IPCTNL_MSG_MAX +}; + +enum ctnl_exp_msg_types { + IPCTNL_MSG_EXP_NEW, + IPCTNL_MSG_EXP_GET, + IPCTNL_MSG_EXP_DELETE, + + IPCTNL_MSG_EXP_MAX +}; + + +enum ctattr_type { + CTA_UNSPEC, + CTA_TUPLE_ORIG, + CTA_TUPLE_REPLY, + CTA_STATUS, + CTA_PROTOINFO, + CTA_HELP, + CTA_NAT, + CTA_TIMEOUT, + CTA_MARK, + CTA_COUNTERS_ORIG, + CTA_COUNTERS_REPLY, + CTA_USE, + CTA_EXPECT, + CTA_ID, + __CTA_MAX +}; +#define CTA_MAX (__CTA_MAX - 1) + +enum ctattr_tuple { + CTA_TUPLE_UNSPEC, + CTA_TUPLE_IP, + CTA_TUPLE_PROTO, + __CTA_TUPLE_MAX +}; +#define CTA_TUPLE_MAX (__CTA_TUPLE_MAX - 1) + +enum ctattr_ip { + CTA_IP_UNSPEC, + CTA_IP_V4_SRC, + CTA_IP_V4_DST, + CTA_IP_V6_SRC, + CTA_IP_V6_DST, + __CTA_IP_MAX +}; +#define CTA_IP_MAX (__CTA_IP_MAX - 1) + +enum ctattr_l4proto { + CTA_PROTO_UNSPEC, + CTA_PROTO_NUM, + CTA_PROTO_SRC_PORT, + CTA_PROTO_DST_PORT, + CTA_PROTO_ICMP_ID, + CTA_PROTO_ICMP_TYPE, + CTA_PROTO_ICMP_CODE, + __CTA_PROTO_MAX +}; +#define CTA_PROTO_MAX (__CTA_PROTO_MAX - 1) + +enum ctattr_protoinfo { + CTA_PROTOINFO_UNSPEC, + CTA_PROTOINFO_TCP_STATE, + __CTA_PROTOINFO_MAX +}; +#define CTA_PROTOINFO_MAX (__CTA_PROTOINFO_MAX - 1) + +enum ctattr_counters { + CTA_COUNTERS_UNSPEC, + CTA_COUNTERS_PACKETS, + CTA_COUNTERS_BYTES, + __CTA_COUNTERS_MAX +}; +#define CTA_COUNTERS_MAX (__CTA_COUNTERS_MAX - 1) + +enum ctattr_nat { + CTA_NAT_UNSPEC, + CTA_NAT_MINIP, + CTA_NAT_MAXIP, + CTA_NAT_PROTO, + __CTA_NAT_MAX +}; +#define CTA_NAT_MAX (__CTA_NAT_MAX - 1) + +enum ctattr_protonat { + CTA_PROTONAT_UNSPEC, + CTA_PROTONAT_PORT_MIN, + CTA_PROTONAT_PORT_MAX, + __CTA_PROTONAT_MAX +}; +#define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1) + +enum ctattr_expect { + CTA_EXPECT_UNSPEC, + CTA_EXPECT_TUPLE, + CTA_EXPECT_MASK, + CTA_EXPECT_TIMEOUT, + CTA_EXPECT_ID, + __CTA_EXPECT_MAX +}; +#define CTA_EXPECT_MAX (__CTA_EXPECT_MAX - 1) + +enum ctattr_help { + CTA_HELP_UNSPEC, + CTA_HELP_NAME, + __CTA_HELP_MAX +}; +#define CTA_HELP_MAX (__CTA_HELP_MAX - 1) + +#define CTA_HELP_MAXNAMESIZE 32 + +#endif /* _IPCONNTRACK_NETLINK_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index ae1270c..ff2c1c6 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -209,6 +209,9 @@ struct ip_conntrack /* Current number of expected connections */ unsigned int expecting; + /* Unique ID that identifies this conntrack*/ + unsigned int id; + /* Helper, if any. */ struct ip_conntrack_helper *helper; @@ -257,6 +260,9 @@ struct ip_conntrack_expect /* Usage count. */ atomic_t use; + /* Unique ID */ + unsigned int id; + #ifdef CONFIG_IP_NF_NAT_NEEDED /* This is the original per-proto part, used to map the * expected connection the way the recipient expects. */ @@ -296,7 +302,12 @@ ip_conntrack_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) } /* decrement reference count on a conntrack */ -extern void ip_conntrack_put(struct ip_conntrack *ct); +static inline void +ip_conntrack_put(struct ip_conntrack *ct) +{ + IP_NF_ASSERT(ct); + nf_conntrack_put(&ct->ct_general); +} /* call to create an explicit dependency on ip_conntrack. */ extern void need_ip_conntrack(void); @@ -331,6 +342,39 @@ extern void ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *data), void *data); +extern struct ip_conntrack_helper * +__ip_conntrack_helper_find_byname(const char *); +extern struct ip_conntrack_helper * +ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple); +extern void ip_conntrack_helper_put(struct ip_conntrack_helper *helper); + +extern struct ip_conntrack_protocol * +__ip_conntrack_proto_find(u_int8_t protocol); +extern struct ip_conntrack_protocol * +ip_conntrack_proto_find_get(u_int8_t protocol); +extern void ip_conntrack_proto_put(struct ip_conntrack_protocol *proto); + +extern void ip_ct_remove_expectations(struct ip_conntrack *ct); + +extern struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *, + struct ip_conntrack_tuple *); + +extern void ip_conntrack_free(struct ip_conntrack *ct); + +extern void ip_conntrack_hash_insert(struct ip_conntrack *ct); + +extern struct ip_conntrack_expect * +__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple); + +extern struct ip_conntrack_expect * +ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple); + +extern struct ip_conntrack_tuple_hash * +__ip_conntrack_find(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack); + +extern void ip_conntrack_flush(void); + /* It's confirmed if it is, or has been in the hash table. */ static inline int is_confirmed(struct ip_conntrack *ct) { diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h index 46eeea1..fbf6c3e 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_core.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h @@ -2,6 +2,9 @@ #define _IP_CONNTRACK_CORE_H #include +#define MAX_IP_CT_PROTO 256 +extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; + /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use of connection tracking. */ @@ -53,6 +56,8 @@ struct ip_conntrack_ecache; extern void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ec); #endif +extern void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp); + extern struct list_head *ip_conntrack_hash; extern struct list_head ip_conntrack_expect_list; extern rwlock_t ip_conntrack_lock; diff --git a/include/linux/netfilter_ipv4/ip_conntrack_helper.h b/include/linux/netfilter_ipv4/ip_conntrack_helper.h index 3692daa..8d69279 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_helper.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_helper.h @@ -24,6 +24,8 @@ struct ip_conntrack_helper int (*help)(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info conntrackinfo); + + int (*to_nfattr)(struct sk_buff *skb, const struct ip_conntrack *ct); }; extern int ip_conntrack_helper_register(struct ip_conntrack_helper *); diff --git a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h index e20b57c..b6b99be 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h @@ -2,6 +2,7 @@ #ifndef _IP_CONNTRACK_PROTOCOL_H #define _IP_CONNTRACK_PROTOCOL_H #include +#include struct seq_file; @@ -47,22 +48,22 @@ struct ip_conntrack_protocol int (*error)(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, unsigned int hooknum); + /* convert protoinfo to nfnetink attributes */ + int (*to_nfattr)(struct sk_buff *skb, struct nfattr *nfa, + const struct ip_conntrack *ct); + + int (*tuple_to_nfattr)(struct sk_buff *skb, + const struct ip_conntrack_tuple *t); + int (*nfattr_to_tuple)(struct nfattr *tb[], + struct ip_conntrack_tuple *t); + /* Module (if any) which this is connected to. */ struct module *me; }; -#define MAX_IP_CT_PROTO 256 -extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; - /* Protocol registration. */ extern int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto); extern void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto); - -static inline struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol) -{ - return ip_ct_protos[protocol]; -} - /* Existing built-in protocols */ extern struct ip_conntrack_protocol ip_conntrack_protocol_tcp; extern struct ip_conntrack_protocol ip_conntrack_protocol_udp; @@ -73,6 +74,11 @@ extern int ip_conntrack_protocol_tcp_init(void); /* Log invalid packets */ extern unsigned int ip_ct_log_invalid; +extern int ip_ct_port_tuple_to_nfattr(struct sk_buff *, + const struct ip_conntrack_tuple *); +extern int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *); + #ifdef CONFIG_SYSCTL #ifdef DEBUG_INVALID_PACKETS #define LOG_INVALID(proto) \ diff --git a/include/linux/netfilter_ipv4/ip_nat_protocol.h b/include/linux/netfilter_ipv4/ip_nat_protocol.h index 129708c..ef63aa9 100644 --- a/include/linux/netfilter_ipv4/ip_nat_protocol.h +++ b/include/linux/netfilter_ipv4/ip_nat_protocol.h @@ -4,6 +4,9 @@ #include #include +#include +#include + struct iphdr; struct ip_nat_range; @@ -15,6 +18,8 @@ struct ip_nat_protocol /* Protocol number. */ unsigned int protonum; + struct module *me; + /* Translate a packet to the target according to manip type. Return true if succeeded. */ int (*manip_pkt)(struct sk_buff **pskb, @@ -43,19 +48,20 @@ struct ip_nat_protocol unsigned int (*print_range)(char *buffer, const struct ip_nat_range *range); -}; -#define MAX_IP_NAT_PROTO 256 -extern struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; + int (*range_to_nfattr)(struct sk_buff *skb, + const struct ip_nat_range *range); + + int (*nfattr_to_range)(struct nfattr *tb[], + struct ip_nat_range *range); +}; /* Protocol registration. */ extern int ip_nat_protocol_register(struct ip_nat_protocol *proto); extern void ip_nat_protocol_unregister(struct ip_nat_protocol *proto); -static inline struct ip_nat_protocol *ip_nat_find_proto(u_int8_t protocol) -{ - return ip_nat_protos[protocol]; -} +extern struct ip_nat_protocol *ip_nat_proto_find_get(u_int8_t protocol); +extern void ip_nat_proto_put(struct ip_nat_protocol *proto); /* Built-in protocols. */ extern struct ip_nat_protocol ip_nat_protocol_tcp; @@ -67,4 +73,9 @@ extern int init_protocols(void) __init; extern void cleanup_protocols(void); extern struct ip_nat_protocol *find_nat_proto(u_int16_t protonum); +extern int ip_nat_port_range_to_nfattr(struct sk_buff *skb, + const struct ip_nat_range *range); +extern int ip_nat_port_nfattr_to_range(struct nfattr *tb[], + struct ip_nat_range *range); + #endif /*_IP_NAT_PROTO_H*/ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index ff3393e..e47ba39 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -702,5 +702,12 @@ config IP_NF_ARP_MANGLE Allows altering the ARP packet payload: source and destination hardware and network addresses. +config IP_NF_CONNTRACK_NETLINK + tristate 'Connection tracking netlink interface' + depends on IP_NF_CONNTRACK && NETFILTER_NETLINK + help + This option enables support for a netlink-based userspace interface + + endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 45796d5..abf2a7d 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -9,6 +9,10 @@ iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helpe # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +# conntrack netlink interface +obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o + + # SCTP protocol connection tracking obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index caf89de..d9fddae 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -50,7 +50,7 @@ #include #include -#define IP_CONNTRACK_VERSION "2.2" +#define IP_CONNTRACK_VERSION "2.3" #if 0 #define DEBUGP printk @@ -77,6 +77,8 @@ unsigned int ip_ct_log_invalid; static LIST_HEAD(unconfirmed); static int ip_conntrack_vmalloc; +static unsigned int ip_conntrack_next_id = 1; +static unsigned int ip_conntrack_expect_next_id = 1; #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS struct notifier_block *ip_conntrack_chain; struct notifier_block *ip_conntrack_expect_chain; @@ -154,13 +156,6 @@ void ip_conntrack_event_cache_init(const struct sk_buff *skb) DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); -void -ip_conntrack_put(struct ip_conntrack *ct) -{ - IP_NF_ASSERT(ct); - nf_conntrack_put(&ct->ct_general); -} - static int ip_conntrack_hash_rnd_initted; static unsigned int ip_conntrack_hash_rnd; @@ -222,6 +217,12 @@ static void unlink_expect(struct ip_conntrack_expect *exp) exp->master->expecting--; } +void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp) +{ + unlink_expect(exp); + ip_conntrack_expect_put(exp); +} + static void expectation_timed_out(unsigned long ul_expect) { struct ip_conntrack_expect *exp = (void *)ul_expect; @@ -232,6 +233,33 @@ static void expectation_timed_out(unsigned long ul_expect) ip_conntrack_expect_put(exp); } +struct ip_conntrack_expect * +__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { + atomic_inc(&i->use); + return i; + } + } + return NULL; +} + +/* Just find a expectation corresponding to a tuple. */ +struct ip_conntrack_expect * +ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + read_lock_bh(&ip_conntrack_lock); + i = __ip_conntrack_expect_find(tuple); + read_unlock_bh(&ip_conntrack_lock); + + return i; +} + /* If an expectation for this connection is found, it gets delete from * global list then returned. */ static struct ip_conntrack_expect * @@ -256,7 +284,7 @@ find_expectation(const struct ip_conntrack_tuple *tuple) } /* delete all expectations for this conntrack */ -static void remove_expectations(struct ip_conntrack *ct) +void ip_ct_remove_expectations(struct ip_conntrack *ct) { struct ip_conntrack_expect *i, *tmp; @@ -286,7 +314,7 @@ clean_from_lists(struct ip_conntrack *ct) LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); /* Destroy all pending expectations */ - remove_expectations(ct); + ip_ct_remove_expectations(ct); } static void @@ -304,7 +332,7 @@ destroy_conntrack(struct nf_conntrack *nfct) /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to ip_conntrack_lock!!! -HW */ - proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); if (proto && proto->destroy) proto->destroy(ct); @@ -316,7 +344,7 @@ destroy_conntrack(struct nf_conntrack *nfct) * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, * too. */ - remove_expectations(ct); + ip_ct_remove_expectations(ct); /* We overload first tuple to link into unconfirmed list. */ if (!is_confirmed(ct)) { @@ -331,8 +359,7 @@ destroy_conntrack(struct nf_conntrack *nfct) ip_conntrack_put(ct->master); DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); - kmem_cache_free(ip_conntrack_cachep, ct); - atomic_dec(&ip_conntrack_count); + ip_conntrack_free(ct); } static void death_by_timeout(unsigned long ul_conntrack) @@ -359,7 +386,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, && ip_ct_tuple_equal(tuple, &i->tuple); } -static struct ip_conntrack_tuple_hash * +struct ip_conntrack_tuple_hash * __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { @@ -394,6 +421,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, return h; } +static void __ip_conntrack_hash_insert(struct ip_conntrack *ct, + unsigned int hash, + unsigned int repl_hash) +{ + ct->id = ++ip_conntrack_next_id; + list_prepend(&ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_prepend(&ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY].list); +} + +void ip_conntrack_hash_insert(struct ip_conntrack *ct) +{ + unsigned int hash, repl_hash; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + write_lock_bh(&ip_conntrack_lock); + __ip_conntrack_hash_insert(ct, hash, repl_hash); + write_unlock_bh(&ip_conntrack_lock); +} + /* Confirm a connection given skb; places it in hash table */ int __ip_conntrack_confirm(struct sk_buff **pskb) @@ -440,10 +490,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb) /* Remove from unconfirmed list */ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - list_prepend(&ip_conntrack_hash[hash], - &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - list_prepend(&ip_conntrack_hash[repl_hash], - &ct->tuplehash[IP_CT_DIR_REPLY]); + __ip_conntrack_hash_insert(ct, hash, repl_hash); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ @@ -527,34 +574,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i, return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); } -static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) +static struct ip_conntrack_helper * +__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) { return LIST_FIND(&helpers, helper_cmp, struct ip_conntrack_helper *, tuple); } -/* Allocate a new conntrack: we return -ENOMEM if classification - failed due to stress. Otherwise it really is unclassifiable. */ -static struct ip_conntrack_tuple_hash * -init_conntrack(const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *protocol, - struct sk_buff *skb) +struct ip_conntrack_helper * +ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_helper *helper; + + /* need ip_conntrack_lock to assure that helper exists until + * try_module_get() is called */ + read_lock_bh(&ip_conntrack_lock); + + helper = __ip_conntrack_helper_find(tuple); + if (helper) { + /* need to increase module usage count to assure helper will + * not go away while the caller is e.g. busy putting a + * conntrack in the hash that uses the helper */ + if (!try_module_get(helper->me)) + helper = NULL; + } + + read_unlock_bh(&ip_conntrack_lock); + + return helper; +} + +void ip_conntrack_helper_put(struct ip_conntrack_helper *helper) +{ + module_put(helper->me); +} + +struct ip_conntrack_protocol * +__ip_conntrack_proto_find(u_int8_t protocol) +{ + return ip_ct_protos[protocol]; +} + +/* this is guaranteed to always return a valid protocol helper, since + * it falls back to generic_protocol */ +struct ip_conntrack_protocol * +ip_conntrack_proto_find_get(u_int8_t protocol) +{ + struct ip_conntrack_protocol *p; + + preempt_disable(); + p = __ip_conntrack_proto_find(protocol); + if (p) { + if (!try_module_get(p->me)) + p = &ip_conntrack_generic_protocol; + } + preempt_enable(); + + return p; +} + +void ip_conntrack_proto_put(struct ip_conntrack_protocol *p) +{ + module_put(p->me); +} + +struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, + struct ip_conntrack_tuple *repl) { struct ip_conntrack *conntrack; - struct ip_conntrack_tuple repl_tuple; - size_t hash; - struct ip_conntrack_expect *exp; if (!ip_conntrack_hash_rnd_initted) { get_random_bytes(&ip_conntrack_hash_rnd, 4); ip_conntrack_hash_rnd_initted = 1; } - hash = hash_conntrack(tuple); - if (ip_conntrack_max && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + unsigned int hash = hash_conntrack(orig); /* Try dropping from this hash chain. */ if (!early_drop(&ip_conntrack_hash[hash])) { if (net_ratelimit()) @@ -565,31 +662,58 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, } } - if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { - DEBUGP("Can't invert tuple.\n"); - return NULL; - } - conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); - return ERR_PTR(-ENOMEM); + return NULL; } memset(conntrack, 0, sizeof(*conntrack)); atomic_set(&conntrack->ct_general.use, 1); conntrack->ct_general.destroy = destroy_conntrack; - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; - if (!protocol->new(conntrack, skb)) { - kmem_cache_free(ip_conntrack_cachep, conntrack); - return NULL; - } + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* Don't set timer yet: wait for confirmation */ init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout; + atomic_inc(&ip_conntrack_count); + + return conntrack; +} + +void +ip_conntrack_free(struct ip_conntrack *conntrack) +{ + atomic_dec(&ip_conntrack_count); + kmem_cache_free(ip_conntrack_cachep, conntrack); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + * failed due to stress. Otherwise it really is unclassifiable */ +static struct ip_conntrack_tuple_hash * +init_conntrack(struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol, + struct sk_buff *skb) +{ + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + struct ip_conntrack_expect *exp; + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple))) + return NULL; + + if (!protocol->new(conntrack, skb)) { + ip_conntrack_free(conntrack); + return NULL; + } + write_lock_bh(&ip_conntrack_lock); exp = find_expectation(tuple); @@ -610,7 +734,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); } else { - conntrack->helper = ip_ct_find_helper(&repl_tuple); + conntrack->helper = __ip_conntrack_helper_find(&repl_tuple); CONNTRACK_STAT_INC(new); } @@ -618,7 +742,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, /* Overload tuple linked list to put us in unconfirmed list. */ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); - atomic_inc(&ip_conntrack_count); write_unlock_bh(&ip_conntrack_lock); if (exp) { @@ -729,7 +852,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, } #endif - proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); + proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter @@ -777,7 +900,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse, const struct ip_conntrack_tuple *orig) { return ip_ct_invert_tuple(inverse, orig, - ip_ct_find_proto(orig->dst.protonum)); + __ip_conntrack_proto_find(orig->dst.protonum)); } /* Would two expected things clash? */ @@ -857,6 +980,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; add_timer(&exp->timeout); + exp->id = ++ip_conntrack_expect_next_id; + atomic_inc(&exp->use); CONNTRACK_STAT_INC(expect_create); } @@ -936,7 +1061,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (!conntrack->master && conntrack->expecting == 0) - conntrack->helper = ip_ct_find_helper(newreply); + conntrack->helper = __ip_conntrack_helper_find(newreply); write_unlock_bh(&ip_conntrack_lock); } @@ -950,6 +1075,19 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me) return 0; } +struct ip_conntrack_helper * +__ip_conntrack_helper_find_byname(const char *name) +{ + struct ip_conntrack_helper *h; + + list_for_each_entry(h, &helpers, list) { + if (!strcmp(h->name, name)) + return h; + } + + return NULL; +} + static inline int unhelp(struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_helper *me) { @@ -1025,6 +1163,39 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, } } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be + * in ip_conntrack_core, since we don't want the protocols to autoload + * or depend on ctnetlink */ +int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), + &tuple->src.u.tcp.port); + NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), + &tuple->dst.u.tcp.port); + return 0; + +nfattr_failure: + return -1; +} + +int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *t) +{ + if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) + return -EINVAL; + + t->src.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); + t->dst.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); + + return 0; +} +#endif + /* Returns new sk_buff, or NULL */ struct sk_buff * ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) @@ -1203,16 +1374,13 @@ static void free_conntrack_hash(void) * ip_conntrack_htable_size)); } -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void ip_conntrack_cleanup(void) +void ip_conntrack_flush() { - ip_ct_attach = NULL; /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... */ synchronize_net(); - + i_see_dead_people: ip_ct_iterate_cleanup(kill_all, NULL); if (atomic_read(&ip_conntrack_count) != 0) { @@ -1222,7 +1390,14 @@ void ip_conntrack_cleanup(void) /* wait until all references to ip_conntrack_untracked are dropped */ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) schedule(); +} +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) +{ + ip_ct_attach = NULL; + ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); free_conntrack_hash(); diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c new file mode 100644 index 0000000..f43ec18 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -0,0 +1,1588 @@ +/* Connection tracking via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist + * (C) 2002-2005 by Harald Welte + * (C) 2003 by Patrick Mchardy + * (C) 2005 by Pablo Neira Ayuso + * + * I've reworked this stuff to use attributes instead of conntrack + * structures. 5.44 am. I need more tea. --pablo 05/07/11. + * + * Initial connection tracking via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); + +static char __initdata version[] = "0.90"; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + + +static inline int +ctnetlink_dump_tuples_proto(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_protocol *proto; + + NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + + proto = ip_conntrack_proto_find_get(tuple->dst.protonum); + if (proto && proto->tuple_to_nfattr) + return proto->tuple_to_nfattr(skb, tuple); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_tuples(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + struct nfattr *nest_parms; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); + NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip); + NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip); + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); + ctnetlink_dump_tuples_proto(skb, tuple); + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t status = htonl((u_int32_t) ct->status); + NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + long timeout_l = ct->timeout.expires - jiffies; + u_int32_t timeout; + + if (timeout_l < 0) + timeout = 0; + else + timeout = htonl(timeout_l / HZ); + + NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + + struct nfattr *nest_proto; + int ret; + + if (!proto || !proto->to_nfattr) + return 0; + + nest_proto = NFA_NEST(skb, CTA_PROTOINFO); + + ret = proto->to_nfattr(skb, nest_proto, ct); + + ip_conntrack_proto_put(proto); + + NFA_NEST_END(skb, nest_proto); + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + struct nfattr *nest_helper; + + if (!ct->helper) + return 0; + + nest_helper = NFA_NEST(skb, CTA_HELP); + NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name); + + if (ct->helper->to_nfattr) + ct->helper->to_nfattr(skb, ct); + + NFA_NEST_END(skb, nest_helper); + + return 0; + +nfattr_failure: + return -1; +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static inline int +ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, + enum ip_conntrack_dir dir) +{ + enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + struct nfattr *nest_count = NFA_NEST(skb, type); + u_int64_t tmp; + + tmp = cpu_to_be64(ct->counters[dir].packets); + NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp); + + tmp = cpu_to_be64(ct->counters[dir].bytes); + NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp); + + NFA_NEST_END(skb, nest_count); + + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_counters(a, b, c) (0) +#endif + +#ifdef CONFIG_IP_NF_CONNTRACK_MARK +static inline int +ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t mark = htonl(ct->mark); + + NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark); + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_mark(a, b) (0) +#endif + +static inline int +ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t id = htonl(ct->id); + NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + unsigned int use = htonl(atomic_read(&ct->ct_general.use)); + + NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); + return 0; + +nfattr_failure: + return -1; +} + +#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) + +static int +ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, int nowait, + const struct ip_conntrack *ct) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static int ctnetlink_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + struct ip_conntrack *ct = (struct ip_conntrack *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + unsigned int flags = 0, groups; + + /* ignore our fake conntrack entry */ + if (ct == &ip_conntrack_untracked) + return NOTIFY_DONE; + + if (events & IPCT_DESTROY) { + type = IPCTNL_MSG_CT_DELETE; + groups = NF_NETLINK_CONNTRACK_DESTROY; + goto alloc_skb; + } + if (events & (IPCT_NEW | IPCT_RELATED)) { + type = IPCTNL_MSG_CT_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + /* dump everything */ + events = ~0UL; + groups = NF_NETLINK_CONNTRACK_NEW; + goto alloc_skb; + } + if (events & (IPCT_STATUS | + IPCT_PROTOINFO | + IPCT_HELPER | + IPCT_HELPINFO | + IPCT_NATINFO)) { + type = IPCTNL_MSG_CT_NEW; + groups = NF_NETLINK_CONNTRACK_UPDATE; + goto alloc_skb; + } + + return NOTIFY_DONE; + +alloc_skb: + /* FIXME: Check if there are any listeners before, don't hurt performance */ + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + /* NAT stuff is now a status flag */ + if ((events & IPCT_STATUS || events & IPCT_NATINFO) + && ctnetlink_dump_status(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_REFRESH + && ctnetlink_dump_timeout(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_PROTOINFO + && ctnetlink_dump_protoinfo(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_HELPINFO + && ctnetlink_dump_helpinfo(skb, ct) < 0) + goto nfattr_failure; + + if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + nfnetlink_send(skb, 0, groups, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + +static int ctnetlink_done(struct netlink_callback *cb) +{ + DEBUGP("entered %s\n", __FUNCTION__); + return 0; +} + +static int +ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack *ct = NULL; + struct ip_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + + DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, + cb->args[0], *id); + + read_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each(i, &ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = tuplehash_to_ctrack(h); + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + } + } +out: + read_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static int +ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack *ct = NULL; + struct ip_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + + DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, + cb->args[0], *id); + + write_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each(i, &ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = tuplehash_to_ctrack(h); + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + + memset(&ct->counters, 0, sizeof(ct->counters)); + } + } +out: + write_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} +#endif + +static const int cta_min_ip[CTA_IP_MAX] = { + [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), + [CTA_IP_V4_DST-1] = sizeof(u_int32_t), +}; + +static inline int +ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_IP_MAX]; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tb, 0, CTA_IP_MAX * sizeof(tb)); + + if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) + return -EINVAL; + + if (!tb[CTA_IP_V4_SRC-1]) + return -EINVAL; + tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); + + if (!tb[CTA_IP_V4_DST-1]) + return -EINVAL; + tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]); + + DEBUGP("leaving\n"); + + return 0; + +nfattr_failure: + return -1; +} + +static const int cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_NUM-1] = sizeof(u_int16_t), + [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), + [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), + [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t), +}; + +static inline int +ctnetlink_parse_tuple_proto(struct nfattr *attr, + struct ip_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_PROTO_MAX]; + struct ip_conntrack_protocol *proto; + int ret = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tb, 0, CTA_PROTO_MAX * sizeof(tb)); + + if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + if (!tb[CTA_PROTO_NUM-1]) + return -EINVAL; + tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + + proto = ip_conntrack_proto_find_get(tuple->dst.protonum); + + if (likely(proto && proto->nfattr_to_tuple)) { + ret = proto->nfattr_to_tuple(tb, tuple); + ip_conntrack_proto_put(proto); + } + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, + enum ctattr_tuple type) +{ + struct nfattr *tb[CTA_TUPLE_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tb, 0, CTA_TUPLE_MAX * sizeof(tb)); + memset(tuple, 0, sizeof(*tuple)); + + if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) + goto nfattr_failure; + + if (!tb[CTA_TUPLE_IP-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); + if (err < 0) + return err; + + if (!tb[CTA_TUPLE_PROTO-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); + if (err < 0) + return err; + + /* orig and expect tuples get DIR_ORIGINAL */ + if (type == CTA_TUPLE_REPLY) + tuple->dst.dir = IP_CT_DIR_REPLY; + else + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + DUMP_TUPLE(tuple); + + DEBUGP("leaving\n"); + + return 0; + +nfattr_failure: + return -1; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +static const int cta_min_protonat[CTA_PROTONAT_MAX] = { + [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), + [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), +}; + +static int ctnetlink_parse_nat_proto(struct nfattr *attr, + const struct ip_conntrack *ct, + struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_PROTONAT_MAX]; + struct ip_nat_protocol *npt; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tb, 0, CTA_PROTONAT_MAX * sizeof(tb)); + + if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) + goto nfattr_failure; + + npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + if (!npt) + return 0; + + if (!npt->nfattr_to_range) { + ip_nat_proto_put(npt); + return 0; + } + + /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */ + if (npt->nfattr_to_range(tb, range) > 0) + range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + + ip_nat_proto_put(npt); + + DEBUGP("leaving\n"); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_parse_nat(struct nfattr *cda[], + const struct ip_conntrack *ct, struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_NAT_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tb, 0, CTA_NAT_MAX * sizeof(tb)); + memset(range, 0, sizeof(*range)); + + if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) + goto nfattr_failure; + + if (tb[CTA_NAT_MINIP-1]) + range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); + + if (!tb[CTA_NAT_MAXIP-1]) + range->max_ip = range->min_ip; + else + range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); + + if (range->min_ip) + range->flags |= IP_NAT_RANGE_MAP_IPS; + + if (!tb[CTA_NAT_PROTO-1]) + return 0; + + err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); + if (err < 0) + return err; + + DEBUGP("leaving\n"); + return 0; + +nfattr_failure: + return -1; +} +#endif + +static inline int +ctnetlink_parse_help(struct nfattr *attr, char **helper_name) +{ + struct nfattr *tb[CTA_HELP_MAX]; + + DEBUGP("entered %s\n", __FUNCTION__); + memset(tb, 0, CTA_HELP_MAX * sizeof(tb)); + + if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) + goto nfattr_failure; + + if (!tb[CTA_HELP_NAME-1]) + return -EINVAL; + + *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); + + return 0; + +nfattr_failure: + return -1; +} + +static int +ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + struct ip_conntrack *ct; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else { + /* Flush the whole table */ + ip_conntrack_flush(); + return 0; + } + + if (err < 0) + return err; + + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash\n"); + return -ENOENT; + } + + ct = tuplehash_to_ctrack(h); + + if (cda[CTA_ID-1]) { + u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1])); + if (ct->id != id) { + ip_conntrack_put(ct); + return -ENOENT; + } + } + if (del_timer(&ct->timeout)) { + ip_conntrack_put(ct); + ct->timeout.function((unsigned long)ct); + return 0; + } + ip_conntrack_put(ct); + DEBUGP("leaving\n"); + + return 0; +} + +static int +ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + struct ip_conntrack *ct; + struct sk_buff *skb2 = NULL; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct nfgenmsg *msg = NLMSG_DATA(nlh); + u32 rlen; + + if (msg->nfgen_family != AF_INET) + return -EAFNOSUPPORT; + + if (NFNL_MSG_TYPE(nlh->nlmsg_type) == + IPCTNL_MSG_CT_GET_CTRZERO) { +#ifdef CONFIG_IP_NF_CT_ACCT + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table_w, + ctnetlink_done)) != 0) + return -EINVAL; +#else + return -ENOTSUPP; +#endif + } else { + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + } + + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else + return -EINVAL; + + if (err < 0) + return err; + + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash"); + return -ENOENT; + } + DEBUGP("tuple found\n"); + ct = tuplehash_to_ctrack(h); + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb2) { + ip_conntrack_put(ct); + return -ENOMEM; + } + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, 1, ct); + ip_conntrack_put(ct); + if (err <= 0) + goto out; + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; + + DEBUGP("leaving\n"); + return 0; + +out: + if (skb2) + kfree_skb(skb2); + return -1; +} + +static inline int +ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]); + d = ct->status ^ status; + + if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) + /* unchangeable */ + return -EINVAL; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EINVAL; + + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EINVAL; + + if (cda[CTA_NAT-1]) { +#ifndef CONFIG_IP_NF_NAT_NEEDED + return -EINVAL; +#else + unsigned int hooknum; + struct ip_nat_range range; + + if (ctnetlink_parse_nat(cda, ct, &range) < 0) + return -EINVAL; + + DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", + NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), + htons(range.min.all), htons(range.max.all)); + + /* This is tricky but it works. ip_nat_setup_info needs the + * hook number as parameter, so let's do the correct + * conversion and run away */ + if (status & IPS_SRC_NAT_DONE) + hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ + else if (status & IPS_DST_NAT_DONE) + hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ + else + return -EINVAL; /* Missing NAT flags */ + + DEBUGP("NAT status: %lu\n", + status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + + if (ip_nat_initialized(ct, hooknum)) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + + DEBUGP("NAT status after setup_info: %lu\n", + ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); +#endif + } + + /* Be careful here, modifying NAT bits can screw up things, + * so don't let users modify them directly if they don't pass + * ip_nat_range. */ + ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + return 0; +} + + +static inline int +ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + struct ip_conntrack_helper *helper; + char *helpname; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + /* don't change helper of sibling connections */ + if (ct->master) + return -EINVAL; + + err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname); + if (err < 0) + return err; + + helper = __ip_conntrack_helper_find_byname(helpname); + if (!helper) { + if (!strcmp(helpname, "")) + helper = NULL; + else + return -EINVAL; + } + + if (ct->helper) { + if (!helper) { + /* we had a helper before ... */ + ip_ct_remove_expectations(ct); + ct->helper = NULL; + } else { + /* need to zero data of old helper */ + memset(&ct->help, 0, sizeof(ct->help)); + } + } + + ct->helper = helper; + + return 0; +} + +static inline int +ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + if (!del_timer(&ct->timeout)) + return -ETIME; + + ct->timeout.expires = jiffies + timeout * HZ; + add_timer(&ct->timeout); + + return 0; +} + +static int +ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_HELP-1]) { + err = ctnetlink_change_helper(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_TIMEOUT-1]) { + err = ctnetlink_change_timeout(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_STATUS-1]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + return err; + } + + DEBUGP("all done\n"); + return 0; +} + +static int +ctnetlink_create_conntrack(struct nfattr *cda[], + struct ip_conntrack_tuple *otuple, + struct ip_conntrack_tuple *rtuple) +{ + struct ip_conntrack *ct; + int err = -EINVAL; + + DEBUGP("entered %s\n", __FUNCTION__); + + ct = ip_conntrack_alloc(otuple, rtuple); + if (ct == NULL || IS_ERR(ct)) + return -ENOMEM; + + if (!cda[CTA_TIMEOUT-1]) + goto err; + ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + ct->status |= IPS_CONFIRMED; + + err = ctnetlink_change_status(ct, cda); + if (err < 0) + goto err; + + ct->helper = ip_conntrack_helper_find_get(rtuple); + + add_timer(&ct->timeout); + ip_conntrack_hash_insert(ct); + + if (ct->helper) + ip_conntrack_helper_put(ct->helper); + + DEBUGP("conntrack with id %u inserted\n", ct->id); + return 0; + +err: + ip_conntrack_free(ct); + return err; +} + +static int +ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple otuple, rtuple; + struct ip_conntrack_tuple_hash *h = NULL; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_TUPLE_ORIG-1]) { + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); + if (err < 0) + return err; + } + + if (cda[CTA_TUPLE_REPLY-1]) { + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY); + if (err < 0) + return err; + } + + write_lock_bh(&ip_conntrack_lock); + if (cda[CTA_TUPLE_ORIG-1]) + h = __ip_conntrack_find(&otuple, NULL); + else if (cda[CTA_TUPLE_REPLY-1]) + h = __ip_conntrack_find(&rtuple, NULL); + + if (h == NULL) { + write_unlock_bh(&ip_conntrack_lock); + DEBUGP("no such conntrack, create new\n"); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); + goto out_unlock; + } else { + /* we only allow nat config for new conntracks */ + if (cda[CTA_NAT-1]) { + err = -EINVAL; + goto out_unlock; + } + } + + /* We manipulate the conntrack inside the global conntrack table lock, + * so there's no need to increase the refcount */ + DEBUGP("conntrack found\n"); + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda); + +out_unlock: + write_unlock_bh(&ip_conntrack_lock); + return err; +} + +/*********************************************************************** + * EXPECT + ***********************************************************************/ + +static inline int +ctnetlink_exp_dump_tuple(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple, + enum ctattr_expect type) +{ + struct nfattr *nest_parms = NFA_NEST(skb, type); + + if (ctnetlink_dump_tuples(skb, tuple) < 0) + goto nfattr_failure; + + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_exp_dump_expect(struct sk_buff *skb, + const struct ip_conntrack_expect *exp) +{ + u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ); + u_int32_t id = htonl(exp->id); + struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT); + + if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) + goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0) + goto nfattr_failure; + + NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); + NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static int +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, + int nowait, + const struct ip_conntrack_expect *exp) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static int ctnetlink_expect_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + int flags = 0; + u16 proto; + + if (events & IPEXP_NEW) { + type = IPCTNL_MSG_EXP_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + } else + return NOTIFY_DONE; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + proto = exp->tuple.dst.protonum; + nfnetlink_send(skb, 0, NF_NETLINK_CONNTRACK_EXP_NEW, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif + +static int +ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack_expect *exp = NULL; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[0]; + + DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); + + read_lock_bh(&ip_conntrack_lock); + list_for_each(i, &ip_conntrack_expect_list) { + exp = (struct ip_conntrack_expect *) i; + if (exp->id <= *id) + continue; + if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + 1, exp) < 0) + goto out; + *id = exp->id; + } +out: + read_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last id=%llu\n", *id); + + return skb->len; +} + +static int +ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_expect *exp; + struct sk_buff *skb2; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct nfgenmsg *msg = NLMSG_DATA(nlh); + u32 rlen; + + if (msg->nfgen_family != AF_INET) + return -EAFNOSUPPORT; + + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_exp_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else + return -EINVAL; + + if (err < 0) + return err; + + exp = ip_conntrack_expect_find_get(&tuple); + if (!exp) + return -ENOENT; + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + goto out; + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + 1, exp); + if (err <= 0) + goto out; + + ip_conntrack_expect_put(exp); + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto free; + + return err; + +out: + ip_conntrack_expect_put(exp); +free: + if (skb2) + kfree_skb(skb2); + return err; +} + +static int +ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_expect *exp, *tmp; + struct ip_conntrack_tuple tuple; + struct ip_conntrack_helper *h; + int err; + + /* delete by tuple needs either orig or reply tuple */ + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else if (cda[CTA_HELP_NAME-1]) { + char *name = NFA_DATA(cda[CTA_HELP_NAME-1]); + + /* delete all expectations for this helper */ + write_lock_bh(&ip_conntrack_lock); + h = __ip_conntrack_helper_find_byname(name); + if (!h) { + write_unlock_bh(&ip_conntrack_lock); + return -EINVAL; + } + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, + list) { + if (exp->master->helper == h + && del_timer(&exp->timeout)) + __ip_ct_expect_unlink_destroy(exp); + } + write_unlock(&ip_conntrack_lock); + return 0; + } else { + /* This basically means we have to flush everything*/ + write_lock_bh(&ip_conntrack_lock); + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, + list) { + if (del_timer(&exp->timeout)) + __ip_ct_expect_unlink_destroy(exp); + } + write_unlock_bh(&ip_conntrack_lock); + return 0; + } + + if (err < 0) + return err; + + /* bump usage count to 2 */ + exp = ip_conntrack_expect_find_get(&tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + ip_conntrack_expect_put(exp); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ + ip_conntrack_unexpect_related(exp); + /* have to put what we 'get' above. after this line usage count == 0 */ + ip_conntrack_expect_put(exp); + + return 0; +} +static int +ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[]) +{ + return -EOPNOTSUPP; +} + +static int +ctnetlink_create_expect(struct nfattr *cda[]) +{ + struct ip_conntrack_tuple tuple, mask, master_tuple; + struct ip_conntrack_tuple_hash *h = NULL; + struct ip_conntrack_expect *exp; + struct ip_conntrack *ct; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASK); + if (err < 0) + return err; + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &master_tuple, + CTA_TUPLE_REPLY); + else + return -EINVAL; + + if (err < 0) + return err; + + /* Look for master conntrack of this expectation */ + h = ip_conntrack_find_get(&master_tuple, NULL); + if (!h) + return -ENOENT; + ct = tuplehash_to_ctrack(h); + + if (!ct->helper) { + /* such conntrack hasn't got any helper, abort */ + err = -EINVAL; + goto out; + } + + exp = ip_conntrack_expect_alloc(ct); + if (!exp) { + err = -ENOMEM; + goto out; + } + + exp->expectfn = NULL; + exp->master = ct; + memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple)); + memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple)); + + err = ip_conntrack_expect_related(exp); + ip_conntrack_expect_put(exp); + +out: + ip_conntrack_put(tuplehash_to_ctrack(h)); + return err; +} + +static int +ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_expect *exp; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (!cda[CTA_EXPECT_TUPLE-1] || !cda[CTA_EXPECT_MASK-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + + write_lock_bh(&ip_conntrack_lock); + exp = __ip_conntrack_expect_find(&tuple); + + if (!exp) { + write_unlock_bh(&ip_conntrack_lock); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_expect(cda); + return err; + } + + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_expect(exp, cda); + write_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving\n"); + + return err; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static struct notifier_block ctnl_notifier = { + .notifier_call = ctnetlink_conntrack_event, +}; + +static struct notifier_block ctnl_notifier_exp = { + .notifier_call = ctnetlink_expect_event, +}; +#endif + +static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { + [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_MAX] = { + [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnetlink_subsystem ctnl_subsys = { + .name = "conntrack", + .subsys_id = NFNL_SUBSYS_CTNETLINK, + .cb_count = IPCTNL_MSG_MAX, + .attr_count = CTA_MAX, + .cb = ctnl_cb, +}; + +static struct nfnetlink_subsystem ctnl_exp_subsys = { + .name = "conntrack_expect", + .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, + .cb_count = IPCTNL_MSG_EXP_MAX, + .attr_count = CTA_MAX, + .cb = ctnl_exp_cb, +}; + +static int __init ctnetlink_init(void) +{ + int ret; + + printk("ctnetlink v%s: registering with nfnetlink.\n", version); + ret = nfnetlink_subsys_register(&ctnl_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register with nfnetlink.\n"); + goto err_out; + } + + ret = nfnetlink_subsys_register(&ctnl_exp_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register exp with nfnetlink.\n"); + goto err_unreg_subsys; + } + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS + ret = ip_conntrack_register_notifier(&ctnl_notifier); + if (ret < 0) { + printk("ctnetlink_init: cannot register notifier.\n"); + goto err_unreg_exp_subsys; + } + + ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp); + if (ret < 0) { + printk("ctnetlink_init: cannot expect register notifier.\n"); + goto err_unreg_notifier; + } +#endif + + return 0; + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +err_unreg_notifier: + ip_conntrack_unregister_notifier(&ctnl_notifier); +err_unreg_exp_subsys: + nfnetlink_subsys_unregister(&ctnl_exp_subsys); +#endif +err_unreg_subsys: + nfnetlink_subsys_unregister(&ctnl_subsys); +err_out: + return ret; +} + +static void __exit ctnetlink_exit(void) +{ + printk("ctnetlink: unregistering from nfnetlink.\n"); + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS + ip_conntrack_unregister_notifier(&ctnl_notifier_exp); + ip_conntrack_unregister_notifier(&ctnl_notifier); +#endif + + nfnetlink_subsys_unregister(&ctnl_exp_subsys); + nfnetlink_subsys_unregister(&ctnl_subsys); + return; +} + +module_init(ctnetlink_init); +module_exit(ctnetlink_exit); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index dca1f63..3f90cb9 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -109,16 +109,17 @@ static int icmp_packet(struct ip_conntrack *ct, return NF_ACCEPT; } +static u_int8_t valid_new[] = { + [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 +}; + /* Called when a new connection for this protocol found. */ static int icmp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) { - static u_int8_t valid_new[] - = { [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 }; - if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ @@ -159,11 +160,12 @@ icmp_error_message(struct sk_buff *skb, return NF_ACCEPT; } - innerproto = ip_ct_find_proto(inside->ip.protocol); + innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; /* Are they talking about one of our connections? */ if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); + ip_conntrack_proto_put(innerproto); return NF_ACCEPT; } @@ -171,8 +173,10 @@ icmp_error_message(struct sk_buff *skb, been preserved inside the ICMP. */ if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { DEBUGP("icmp_error_track: Can't invert tuple\n"); + ip_conntrack_proto_put(innerproto); return NF_ACCEPT; } + ip_conntrack_proto_put(innerproto); *ctinfo = IP_CT_RELATED; @@ -266,6 +270,47 @@ checksum_skipped: return icmp_error_message(skb, ctinfo, hooknum); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +static int icmp_tuple_to_nfattr(struct sk_buff *skb, + const struct ip_conntrack_tuple *t) +{ + NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), + &t->src.u.icmp.id); + NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), + &t->dst.u.icmp.type); + NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), + &t->dst.u.icmp.code); + + if (t->dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[t->dst.u.icmp.type]) + return -EINVAL; + + return 0; + +nfattr_failure: + return -1; +} + +static int icmp_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMP_TYPE-1] + || !tb[CTA_PROTO_ICMP_CODE-1] + || !tb[CTA_PROTO_ICMP_ID-1]) + return -1; + + tuple->dst.u.icmp.type = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); + tuple->dst.u.icmp.code = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); + tuple->src.u.icmp.id = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + + return 0; +} +#endif + struct ip_conntrack_protocol ip_conntrack_protocol_icmp = { .proto = IPPROTO_ICMP, @@ -277,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp = .packet = icmp_packet, .new = icmp_new, .error = icmp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = icmp_tuple_to_nfattr, + .nfattr_to_tuple = icmp_nfattr_to_tuple, +#endif }; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 3d5f878..a875f35 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -505,7 +505,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { .packet = sctp_packet, .new = sctp_new, .destroy = NULL, - .me = THIS_MODULE + .me = THIS_MODULE, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index a569ad1..c2bce22 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -336,6 +336,23 @@ static int tcp_print_conntrack(struct seq_file *s, return seq_printf(s, "%s ", tcp_conntrack_names[state]); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, + const struct ip_conntrack *ct) +{ + read_lock_bh(&tcp_lock); + NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), + &ct->proto.tcp.state); + read_unlock_bh(&tcp_lock); + + return 0; + +nfattr_failure: + return -1; +} +#endif + static unsigned int get_conntrack_index(const struct tcphdr *tcph) { if (tcph->rst) return TCP_RST_SET; @@ -1100,4 +1117,10 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp = .packet = tcp_packet, .new = tcp_new, .error = tcp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .to_nfattr = tcp_to_nfattr, + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 6066eaf..1413016 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -145,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp = .packet = udp_packet, .new = udp_new, .error = udp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index f088000..ca97c3a 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -5,7 +5,7 @@ */ /* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team + * (C) 2002-2005 Netfilter Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (DIRECTION(hash)) return 0; - proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum); + proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); IP_NF_ASSERT(proto); if (seq_printf(s, "%-8s %u %ld ", @@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v) seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - ip_ct_find_proto(expect->tuple.dst.protonum)); + __ip_conntrack_proto_find(expect->tuple.dst.protonum)); return seq_putc(s, '\n'); } @@ -992,12 +991,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_register); EXPORT_SYMBOL(ip_conntrack_helper_unregister); EXPORT_SYMBOL(ip_ct_iterate_cleanup); EXPORT_SYMBOL(ip_ct_refresh_acct); -EXPORT_SYMBOL(ip_ct_protos); -EXPORT_SYMBOL(ip_ct_find_proto); + EXPORT_SYMBOL(ip_conntrack_expect_alloc); EXPORT_SYMBOL(ip_conntrack_expect_put); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get); EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_unexpect_related); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); +EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find); +EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy); + EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); EXPORT_SYMBOL(ip_conntrack_htable_size); @@ -1005,7 +1008,28 @@ EXPORT_SYMBOL(ip_conntrack_lock); EXPORT_SYMBOL(ip_conntrack_hash); EXPORT_SYMBOL(ip_conntrack_untracked); EXPORT_SYMBOL_GPL(ip_conntrack_find_get); -EXPORT_SYMBOL_GPL(ip_conntrack_put); #ifdef CONFIG_IP_NF_NAT_NEEDED EXPORT_SYMBOL(ip_conntrack_tcp_update); #endif + +EXPORT_SYMBOL_GPL(ip_conntrack_flush); +EXPORT_SYMBOL_GPL(__ip_conntrack_find); + +EXPORT_SYMBOL_GPL(ip_conntrack_alloc); +EXPORT_SYMBOL_GPL(ip_conntrack_free); +EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert); + +EXPORT_SYMBOL_GPL(ip_ct_remove_expectations); + +EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_helper_put); +EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); + +EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); +EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); +EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple); +#endif diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index ed4d731..567c802 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -47,8 +47,39 @@ DEFINE_RWLOCK(ip_nat_lock); static unsigned int ip_nat_htable_size; static struct list_head *bysource; + +#define MAX_IP_NAT_PROTO 256 struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; +static inline struct ip_nat_protocol * +__ip_nat_proto_find(u_int8_t protonum) +{ + return ip_nat_protos[protonum]; +} + +struct ip_nat_protocol * +ip_nat_proto_find_get(u_int8_t protonum) +{ + struct ip_nat_protocol *p; + + /* we need to disable preemption to make sure 'p' doesn't get + * removed until we've grabbed the reference */ + preempt_disable(); + p = __ip_nat_proto_find(protonum); + if (p) { + if (!try_module_get(p->me)) + p = &ip_nat_unknown_protocol; + } + preempt_enable(); + + return p; +} + +void +ip_nat_proto_put(struct ip_nat_protocol *p) +{ + module_put(p->me); +} /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int @@ -103,7 +134,8 @@ static int in_range(const struct ip_conntrack_tuple *tuple, const struct ip_nat_range *range) { - struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum); + struct ip_nat_protocol *proto = + __ip_nat_proto_find(tuple->dst.protonum); /* If we are supposed to map IPs, then we must be in the range specified, otherwise let this drag us onto a new src IP. */ @@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, struct ip_conntrack *conntrack, enum ip_nat_manip_type maniptype) { - struct ip_nat_protocol *proto - = ip_nat_find_proto(orig_tuple->dst.protonum); + struct ip_nat_protocol *proto; /* 1) If this srcip/proto/src-proto-part is currently mapped, and that same mapping gives a unique tuple within the given @@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, /* 3) The per-protocol part of the manip is made to map into the range to make a unique tuple. */ + proto = ip_nat_proto_find_get(orig_tuple->dst.protonum); + /* Only bother mapping if it's not already in range and unique */ if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || proto->in_range(tuple, maniptype, &range->min, &range->max)) - && !ip_nat_used_tuple(tuple, conntrack)) + && !ip_nat_used_tuple(tuple, conntrack)) { + ip_nat_proto_put(proto); return; + } /* Last change: get protocol to try to obtain unique tuple. */ proto->unique_tuple(tuple, range, maniptype, conntrack); + + ip_nat_proto_put(proto); } unsigned int @@ -320,6 +357,7 @@ manip_pkt(u_int16_t proto, enum ip_nat_manip_type maniptype) { struct iphdr *iph; + struct ip_nat_protocol *p; if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) return 0; @@ -327,9 +365,12 @@ manip_pkt(u_int16_t proto, iph = (void *)(*pskb)->data + iphdroff; /* Manipulate protcol part. */ - if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff, - target, maniptype)) + p = ip_nat_proto_find_get(proto); + if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) { + ip_nat_proto_put(p); return 0; + } + ip_nat_proto_put(p); iph = (void *)(*pskb)->data + iphdroff; @@ -425,7 +466,8 @@ int icmp_reply_translation(struct sk_buff **pskb, if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + sizeof(struct icmphdr) + inside->ip.ihl*4, - &inner, ip_ct_find_proto(inside->ip.protocol))) + &inner, + __ip_conntrack_proto_find(inside->ip.protocol))) return 0; /* Change inner back to look like incoming packet. We do the @@ -495,6 +537,49 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) synchronize_net(); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +int +ip_nat_port_range_to_nfattr(struct sk_buff *skb, + const struct ip_nat_range *range) +{ + NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t), + &range->min.tcp.port); + NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t), + &range->max.tcp.port); + + return 0; + +nfattr_failure: + return -1; +} + +int +ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range) +{ + int ret = 0; + + /* we have to return whether we actually parsed something or not */ + + if (tb[CTA_PROTONAT_PORT_MIN-1]) { + ret = 1; + range->min.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]); + } + + if (!tb[CTA_PROTONAT_PORT_MAX-1]) { + if (ret) + range->max.tcp.port = range->min.tcp.port; + } else { + ret = 1; + range->max.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]); + } + + return ret; +} +#endif + int __init ip_nat_init(void) { size_t i; diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index 6596c9e..38fdfc2 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -107,10 +107,15 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_protocol_icmp -= { "ICMP", IPPROTO_ICMP, += { "ICMP", IPPROTO_ICMP, THIS_MODULE, icmp_manip_pkt, icmp_in_range, icmp_unique_tuple, icmp_print, - icmp_print_range + icmp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + ip_nat_port_range_to_nfattr, + ip_nat_port_nfattr_to_range, +#endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index a98e36d..f03cd0f 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -170,10 +171,15 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_protocol_tcp -= { "TCP", IPPROTO_TCP, += { "TCP", IPPROTO_TCP, THIS_MODULE, tcp_manip_pkt, tcp_in_range, tcp_unique_tuple, tcp_print, - tcp_print_range + tcp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + ip_nat_port_range_to_nfattr, + ip_nat_port_nfattr_to_range, +#endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index 9f66e56..7a4e66e 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -157,10 +157,15 @@ udp_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_protocol_udp -= { "UDP", IPPROTO_UDP, += { "UDP", IPPROTO_UDP, THIS_MODULE, udp_manip_pkt, udp_in_range, udp_unique_tuple, udp_print, - udp_print_range + udp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + ip_nat_port_range_to_nfattr, + ip_nat_port_nfattr_to_range, +#endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index f5525bd..512d8f2 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -61,7 +61,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_unknown_protocol = { - "unknown", 0, + "unknown", 0, THIS_MODULE, unknown_manip_pkt, unknown_in_range, unknown_unique_tuple, diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 9ecba97..89db052 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -394,6 +394,8 @@ module_exit(fini); EXPORT_SYMBOL(ip_nat_setup_info); EXPORT_SYMBOL(ip_nat_protocol_register); EXPORT_SYMBOL(ip_nat_protocol_unregister); +EXPORT_SYMBOL_GPL(ip_nat_proto_find_get); +EXPORT_SYMBOL_GPL(ip_nat_proto_put); EXPORT_SYMBOL(ip_nat_cheat_check); EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); EXPORT_SYMBOL(ip_nat_mangle_udp_packet); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 710acd7..b0ed5798 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -121,6 +121,7 @@ void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, nfa->nfa_type = attrtype; nfa->nfa_len = size; memcpy(NFA_DATA(nfa), data, attrlen); + memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); } int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) -- cgit v0.10.2 From 83e3609eba3818f6e18b8bf9442195169ac306b7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:33:31 -0700 Subject: [REQSK]: Move the syn_table destroy from tcp_listen_stop to reqsk_queue_destroy Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 72fd6f5..334717b 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -89,6 +89,7 @@ struct listen_sock { int qlen_young; int clock_hand; u32 hash_rnd; + u32 nr_table_entries; struct request_sock *syn_table[0]; }; @@ -129,11 +130,13 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock return lopt; } -static inline void reqsk_queue_destroy(struct request_sock_queue *queue) +static inline void __reqsk_queue_destroy(struct request_sock_queue *queue) { kfree(reqsk_queue_yank_listen_sk(queue)); } +extern void reqsk_queue_destroy(struct request_sock_queue *queue); + static inline struct request_sock * reqsk_queue_yank_acceptq(struct request_sock_queue *queue) { diff --git a/net/core/request_sock.c b/net/core/request_sock.c index bb55675..4e99ce5 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -53,6 +53,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); rwlock_init(&queue->syn_wait_lock); queue->rskq_accept_head = queue->rskq_accept_head = NULL; + lopt->nr_table_entries = nr_table_entries; write_lock_bh(&queue->syn_wait_lock); queue->listen_opt = lopt; @@ -62,3 +63,28 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, } EXPORT_SYMBOL(reqsk_queue_alloc); + +void reqsk_queue_destroy(struct request_sock_queue *queue) +{ + /* make all the listen_opt local to us */ + struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); + + if (lopt->qlen != 0) { + int i; + + for (i = 0; i < lopt->nr_table_entries; i++) { + struct request_sock *req; + + while ((req = lopt->syn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + reqsk_free(req); + } + } + } + + BUG_TRAP(lopt->qlen == 0); + kfree(lopt); +} + +EXPORT_SYMBOL(reqsk_queue_destroy); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d2696af..42a2e2c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -487,7 +487,7 @@ int tcp_listen_start(struct sock *sk) } sk->sk_state = TCP_CLOSE; - reqsk_queue_destroy(&tp->accept_queue); + __reqsk_queue_destroy(&tp->accept_queue); return -EADDRINUSE; } @@ -499,38 +499,23 @@ int tcp_listen_start(struct sock *sk) static void tcp_listen_stop (struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt; struct request_sock *acc_req; struct request_sock *req; - int i; tcp_delete_keepalive_timer(sk); /* make all the listen_opt local to us */ - lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue); acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue); - if (lopt->qlen) { - for (i = 0; i < TCP_SYNQ_HSIZE; i++) { - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - lopt->qlen--; - reqsk_free(req); - - /* Following specs, it would be better either to send FIN - * (and enter FIN-WAIT-1, it is normal close) - * or to send active reset (abort). - * Certainly, it is pretty dangerous while synflood, but it is - * bad justification for our negligence 8) - * To be honest, we are not able to make either - * of the variants now. --ANK - */ - } - } - } - BUG_TRAP(!lopt->qlen); - - kfree(lopt); + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. --ANK + */ + reqsk_queue_destroy(&tp->accept_queue); while ((req = acc_req) != NULL) { struct sock *child = req->sk; -- cgit v0.10.2 From b6b99eb5409d75ae35390057cd28f3aedfbd4cf4 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 19:33:51 -0700 Subject: [NET]: Reduce tc_index/tc_verd to u16 Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4aeadb1..af4f02e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -265,9 +265,9 @@ struct sk_buff { #endif #endif /* CONFIG_NETFILTER */ #ifdef CONFIG_NET_SCHED - __u32 tc_index; /* traffic control index */ + __u16 tc_index; /* traffic control index */ #ifdef CONFIG_NET_CLS_ACT - __u32 tc_verd; /* traffic control verdict */ + __u16 tc_verd; /* traffic control verdict */ #endif #endif -- cgit v0.10.2 From f2ccd8fa06c8e302116e71df372f5c1f83432e03 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Aug 2005 19:34:12 -0700 Subject: [NET]: Kill skb->real_dev Bonding just wants the device before the skb_bond() decapsulation occurs, so simply pass that original device into packet_type->func() as an argument. It remains to be seen whether we can use this same exact thing to get rid of skb->input_dev as well. Signed-off-by: David S. Miller diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index 9e6f51c..4be9769 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -120,7 +120,7 @@ aoenet_xmit(struct sk_buff *sl) * (1) len doesn't include the header by default. I want this. */ static int -aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt) +aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) { struct aoe_hdr *h; u32 n; diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index a2e8dda..d2f34d5 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2419,22 +2419,19 @@ out: return 0; } -int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype) +int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev) { struct bonding *bond = dev->priv; struct slave *slave = NULL; int ret = NET_RX_DROP; - if (!(dev->flags & IFF_MASTER)) { + if (!(dev->flags & IFF_MASTER)) goto out; - } read_lock(&bond->lock); - slave = bond_get_slave_by_dev((struct bonding *)dev->priv, - skb->real_dev); - if (slave == NULL) { + slave = bond_get_slave_by_dev((struct bonding *)dev->priv, orig_dev); + if (!slave) goto out_unlock; - } bond_3ad_rx_indication((struct lacpdu *) skb->data, slave, skb->len); diff --git a/drivers/net/bonding/bond_3ad.h b/drivers/net/bonding/bond_3ad.h index f468238..673a30a 100644 --- a/drivers/net/bonding/bond_3ad.h +++ b/drivers/net/bonding/bond_3ad.h @@ -295,6 +295,6 @@ void bond_3ad_adapter_duplex_changed(struct slave *slave); void bond_3ad_handle_link_change(struct slave *slave, char link); int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info); int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev); -int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype); +int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev); #endif //__BOND_3AD_H__ diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 19e829b..f8fce39 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -354,15 +354,14 @@ static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp) _unlock_rx_hashtbl(bond); } -static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct packet_type *ptype) +static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct packet_type *ptype, struct net_device *orig_dev) { struct bonding *bond = bond_dev->priv; struct arp_pkt *arp = (struct arp_pkt *)skb->data; int res = NET_RX_DROP; - if (!(bond_dev->flags & IFF_MASTER)) { + if (!(bond_dev->flags & IFF_MASTER)) goto out; - } if (!arp) { dprintk("Packet has no ARP data\n"); diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index ba9f058..2946e03 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -98,7 +98,7 @@ static char bcast_addr[6]={0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; static char bpq_eth_addr[6]; -static int bpq_rcv(struct sk_buff *, struct net_device *, struct packet_type *); +static int bpq_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); static int bpq_device_event(struct notifier_block *, unsigned long, void *); static const char *bpq_print_ethaddr(const unsigned char *); @@ -165,7 +165,7 @@ static inline int dev_is_ethdev(struct net_device *dev) /* * Receive an AX.25 frame via an ethernet interface. */ -static int bpq_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype) +static int bpq_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len; char * ptr; diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index ce1a9bf..82f236c 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -377,7 +377,8 @@ abort_kfree: ***********************************************************************/ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, + struct net_device *orig_dev) { struct pppoe_hdr *ph; @@ -426,7 +427,8 @@ out: ***********************************************************************/ static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, + struct net_device *orig_dev) { struct pppoe_hdr *ph; diff --git a/drivers/net/wan/hdlc_generic.c b/drivers/net/wan/hdlc_generic.c index a63f6a2..cdd4c09c 100644 --- a/drivers/net/wan/hdlc_generic.c +++ b/drivers/net/wan/hdlc_generic.c @@ -61,7 +61,7 @@ static struct net_device_stats *hdlc_get_stats(struct net_device *dev) static int hdlc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *p) + struct packet_type *p, struct net_device *orig_dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto.netif_rx) diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index 7f2e365..6c302e9 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -86,7 +86,7 @@ static __inline__ int dev_is_ethdev(struct net_device *dev) /* * Receive a LAPB frame via an ethernet interface. */ -static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype) +static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len, err; struct lapbethdev *lapbeth; diff --git a/drivers/net/wan/syncppp.c b/drivers/net/wan/syncppp.c index 84b65c6..f58c794 100644 --- a/drivers/net/wan/syncppp.c +++ b/drivers/net/wan/syncppp.c @@ -1447,7 +1447,7 @@ static void sppp_print_bytes (u_char *p, u16 len) * after interrupt servicing to process frames queued via netif_rx. */ -static int sppp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p) +static int sppp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p, struct net_device *orig_dev) { if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 62a9d89..17d0c0d 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -155,7 +155,6 @@ static inline int __vlan_hwaccel_rx(struct sk_buff *skb, { struct net_device_stats *stats; - skb->real_dev = skb->dev; skb->dev = grp->vlan_devices[vlan_tag & VLAN_VID_MASK]; if (skb->dev == NULL) { dev_kfree_skb_any(skb); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a0ed7f..296cf93 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -497,10 +497,12 @@ static inline void *netdev_priv(struct net_device *dev) #define SET_NETDEV_DEV(net, pdev) ((net)->class_dev.dev = (pdev)) struct packet_type { - __be16 type; /* This is really htons(ether_type). */ - struct net_device *dev; /* NULL is wildcarded here */ - int (*func) (struct sk_buff *, struct net_device *, - struct packet_type *); + __be16 type; /* This is really htons(ether_type). */ + struct net_device *dev; /* NULL is wildcarded here */ + int (*func) (struct sk_buff *, + struct net_device *, + struct packet_type *, + struct net_device *); void *af_packet_priv; struct list_head list; }; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index af4f02e..60b3215 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -164,7 +164,6 @@ struct skb_shared_info { * @stamp: Time we arrived * @dev: Device we arrived on/are leaving by * @input_dev: Device we arrived on - * @real_dev: The real device we are using * @h: Transport layer header * @nh: Network layer header * @mac: Link layer header @@ -206,7 +205,6 @@ struct sk_buff { struct timeval stamp; struct net_device *dev; struct net_device *input_dev; - struct net_device *real_dev; union { struct tcphdr *th; diff --git a/include/net/arp.h b/include/net/arp.h index a1f09fa..a13e30c 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -11,7 +11,7 @@ extern struct neigh_table arp_tbl; extern void arp_init(void); extern int arp_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, struct net_device *orig_dev); extern int arp_find(unsigned char *haddr, struct sk_buff *skb); extern int arp_ioctl(unsigned int cmd, void __user *arg); extern void arp_send(int type, int ptype, u32 dest_ip, diff --git a/include/net/ax25.h b/include/net/ax25.h index 3696f98..926eed5 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -316,7 +316,7 @@ extern int ax25_protocol_is_registered(unsigned int); /* ax25_in.c */ extern int ax25_rx_iframe(ax25_cb *, struct sk_buff *); -extern int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *); +extern int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); /* ax25_ip.c */ extern int ax25_encapsulate(struct sk_buff *, struct net_device *, unsigned short, void *, void *, unsigned int); diff --git a/include/net/datalink.h b/include/net/datalink.h index 5797ba3..deb7ca7 100644 --- a/include/net/datalink.h +++ b/include/net/datalink.h @@ -9,7 +9,7 @@ struct datalink_proto { unsigned short header_length; int (*rcvfunc)(struct sk_buff *, struct net_device *, - struct packet_type *); + struct packet_type *, struct net_device *); int (*request)(struct datalink_proto *, struct sk_buff *, unsigned char *); struct list_head node; diff --git a/include/net/ip.h b/include/net/ip.h index 32360bb..2570b53 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -86,7 +86,7 @@ extern int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, struct ip_options *opt); extern int ip_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, struct net_device *orig_dev); extern int ip_local_deliver(struct sk_buff *skb); extern int ip_mr_input(struct sk_buff *skb); extern int ip_output(struct sk_buff *skb); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6932446..533fc07 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -346,7 +346,8 @@ static inline int ipv6_addr_any(const struct in6_addr *a) extern int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, + struct net_device *orig_dev); /* * upper-layer output functions diff --git a/include/net/llc.h b/include/net/llc.h index c9aed2a..71769a5 100644 --- a/include/net/llc.h +++ b/include/net/llc.h @@ -46,7 +46,8 @@ struct llc_sap { unsigned char f_bit; int (*rcv_func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, + struct net_device *orig_dev); struct llc_addr laddr; struct list_head node; struct { @@ -64,7 +65,7 @@ extern rwlock_t llc_sap_list_lock; extern unsigned char llc_station_mac_sa[ETH_ALEN]; extern int llc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, struct net_device *orig_dev); extern int llc_mac_hdr_init(struct sk_buff *skb, unsigned char *sa, unsigned char *da); @@ -78,7 +79,8 @@ extern void llc_set_station_handler(void (*handler)(struct sk_buff *skb)); extern struct llc_sap *llc_sap_open(unsigned char lsap, int (*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)); + struct packet_type *pt, + struct net_device *orig_dev)); extern void llc_sap_close(struct llc_sap *sap); extern struct llc_sap *llc_sap_find(unsigned char sap_value); diff --git a/include/net/p8022.h b/include/net/p8022.h index 3c99a86..223f8fa 100644 --- a/include/net/p8022.h +++ b/include/net/p8022.h @@ -4,7 +4,8 @@ extern struct datalink_proto * register_8022_client(unsigned char type, int (*func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)); + struct packet_type *pt, + struct net_device *orig_dev)); extern void unregister_8022_client(struct datalink_proto *proto); #endif diff --git a/include/net/psnap.h b/include/net/psnap.h index 9c94e8f..b2e01cc 100644 --- a/include/net/psnap.h +++ b/include/net/psnap.h @@ -1,7 +1,7 @@ #ifndef _NET_PSNAP_H #define _NET_PSNAP_H -extern struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *)); +extern struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *orig_dev)); extern void unregister_snap_client(struct datalink_proto *proto); #endif diff --git a/include/net/x25.h b/include/net/x25.h index 8b39b98..fee62ff 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -175,7 +175,7 @@ extern void x25_kill_by_neigh(struct x25_neigh *); /* x25_dev.c */ extern void x25_send_frame(struct sk_buff *, struct x25_neigh *); -extern int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *); +extern int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); extern void x25_establish_link(struct x25_neigh *); extern void x25_terminate_link(struct x25_neigh *); diff --git a/net/802/p8022.c b/net/802/p8022.c index 5ae6341..b24817c 100644 --- a/net/802/p8022.c +++ b/net/802/p8022.c @@ -35,7 +35,8 @@ static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, struct datalink_proto *register_8022_client(unsigned char type, int (*func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)) + struct packet_type *pt, + struct net_device *orig_dev)) { struct datalink_proto *proto; diff --git a/net/802/psnap.c b/net/802/psnap.c index 1053821..ab80b1f 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -47,7 +47,7 @@ static struct datalink_proto *find_snap_client(unsigned char *desc) * A SNAP packet has arrived */ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { int rc = 1; struct datalink_proto *proto; @@ -61,7 +61,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, /* Pass the frame on. */ skb->h.raw += 5; skb_pull(skb, 5); - rc = proto->rcvfunc(skb, dev, &snap_packet_type); + rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } else { skb->sk = NULL; kfree_skb(skb); @@ -118,7 +118,8 @@ module_exit(snap_exit); struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, - struct packet_type *)) + struct packet_type *, + struct net_device *)) { struct datalink_proto *proto = NULL; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 508b1fa..9ae3a14 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -51,7 +51,7 @@ struct net_device *__find_vlan_dev(struct net_device* real_dev, /* found in vlan_dev.c */ int vlan_dev_rebuild_header(struct sk_buff *skb); int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type* ptype); + struct packet_type *ptype, struct net_device *orig_dev); int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 49c4874..145f5cd 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -113,7 +113,7 @@ static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb) * */ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type* ptype) + struct packet_type* ptype, struct net_device *orig_dev) { unsigned char *rawp = NULL; struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data); diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index c34614e..7076097 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -698,7 +698,7 @@ static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a, * frame. We currently only support Ethernet. */ static int aarp_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { struct elapaarp *ea = aarp_hdr(skb); int hash, ret = 0; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 192b529..ffde33c 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1390,7 +1390,7 @@ free_it: * [ie ARPHRD_ETHERTALK] */ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { struct ddpehdr *ddp; struct sock *sock; @@ -1482,7 +1482,7 @@ freeit: * header and append a long one. */ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { /* Expand any short form frames */ if (skb->mac.raw[2] == 1) { @@ -1528,7 +1528,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, } skb->h.raw = skb->data; - return atalk_rcv(skb, dev, pt); + return atalk_rcv(skb, dev, pt, orig_dev); freeit: kfree_skb(skb); return 0; diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 3dc808f..124eec8 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -132,7 +132,7 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) skb->dev = ax25->ax25_dev->dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_IP); - ip_rcv(skb, skb->dev, NULL); /* Wrong ptype */ + ip_rcv(skb, skb->dev, NULL, skb->dev); /* Wrong ptype */ return 1; } #endif @@ -258,7 +258,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_IP); - ip_rcv(skb, dev, ptype); /* Note ptype here is the wrong one, fix me later */ + ip_rcv(skb, dev, ptype, dev); /* Note ptype here is the wrong one, fix me later */ break; case AX25_P_ARP: @@ -268,7 +268,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_ARP); - arp_rcv(skb, dev, ptype); /* Note ptype here is wrong... */ + arp_rcv(skb, dev, ptype, dev); /* Note ptype here is wrong... */ break; #endif case AX25_P_TEXT: @@ -454,7 +454,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, * Receive an AX.25 frame via a SLIP interface. */ int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *ptype) + struct packet_type *ptype, struct net_device *orig_dev) { skb->sk = NULL; /* Initially we don't know who it's for */ skb->destructor = NULL; /* Who initializes this, dammit?! */ diff --git a/net/core/dev.c b/net/core/dev.c index faf59b0..e1cc162 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1058,7 +1058,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) skb2->h.raw = skb2->nh.raw; skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype); + ptype->func(skb2, skb->dev, ptype, skb->dev); } } rcu_read_unlock(); @@ -1425,14 +1425,14 @@ int netif_rx_ni(struct sk_buff *skb) EXPORT_SYMBOL(netif_rx_ni); -static __inline__ void skb_bond(struct sk_buff *skb) +static inline struct net_device *skb_bond(struct sk_buff *skb) { struct net_device *dev = skb->dev; - if (dev->master) { - skb->real_dev = skb->dev; + if (dev->master) skb->dev = dev->master; - } + + return dev; } static void net_tx_action(struct softirq_action *h) @@ -1482,10 +1482,11 @@ static void net_tx_action(struct softirq_action *h) } static __inline__ int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev) + struct packet_type *pt_prev, + struct net_device *orig_dev) { atomic_inc(&skb->users); - return pt_prev->func(skb, skb->dev, pt_prev); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) @@ -1496,7 +1497,8 @@ struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); static __inline__ int handle_bridge(struct sk_buff **pskb, - struct packet_type **pt_prev, int *ret) + struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev) { struct net_bridge_port *port; @@ -1505,14 +1507,14 @@ static __inline__ int handle_bridge(struct sk_buff **pskb, return 0; if (*pt_prev) { - *ret = deliver_skb(*pskb, *pt_prev); + *ret = deliver_skb(*pskb, *pt_prev, orig_dev); *pt_prev = NULL; } return br_handle_frame_hook(port, pskb); } #else -#define handle_bridge(skb, pt_prev, ret) (0) +#define handle_bridge(skb, pt_prev, ret, orig_dev) (0) #endif #ifdef CONFIG_NET_CLS_ACT @@ -1559,6 +1561,7 @@ static int ing_filter(struct sk_buff *skb) int netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; + struct net_device *orig_dev; int ret = NET_RX_DROP; unsigned short type; @@ -1569,7 +1572,7 @@ int netif_receive_skb(struct sk_buff *skb) if (!skb->stamp.tv_sec) net_timestamp(&skb->stamp); - skb_bond(skb); + orig_dev = skb_bond(skb); __get_cpu_var(netdev_rx_stat).total++; @@ -1590,14 +1593,14 @@ int netif_receive_skb(struct sk_buff *skb) list_for_each_entry_rcu(ptype, &ptype_all, list) { if (!ptype->dev || ptype->dev == skb->dev) { if (pt_prev) - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } #ifdef CONFIG_NET_CLS_ACT if (pt_prev) { - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; /* noone else should process this after*/ } else { skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); @@ -1616,7 +1619,7 @@ ncls: handle_diverter(skb); - if (handle_bridge(&skb, &pt_prev, &ret)) + if (handle_bridge(&skb, &pt_prev, &ret, orig_dev)) goto out; type = skb->protocol; @@ -1624,13 +1627,13 @@ ncls: if (ptype->type == type && (!ptype->dev || ptype->dev == skb->dev)) { if (pt_prev) - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } if (pt_prev) { - ret = pt_prev->func(skb, skb->dev, pt_prev); + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { kfree_skb(skb); /* Jamal, now you will not able to escape explaining diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 16df7bd..ef498cb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -333,7 +333,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) n->sk = NULL; C(stamp); C(dev); - C(real_dev); C(h); C(nh); C(mac); @@ -397,7 +396,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->sk = NULL; new->dev = old->dev; - new->real_dev = old->real_dev; new->priority = old->priority; new->protocol = old->protocol; new->dst = dst_clone(old->dst); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 0c30409..bd49dd9 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2064,7 +2064,7 @@ static struct notifier_block dn_dev_notifier = { .notifier_call = dn_device_event, }; -extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *); +extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); static struct packet_type dn_dix_packet_type = { .type = __constant_htons(ETH_P_DNA_RT), diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 2399fa8..2c915f3 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -572,7 +572,7 @@ static int dn_route_ptp_hello(struct sk_buff *skb) return NET_RX_SUCCESS; } -int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct dn_skb_cb *cb; unsigned char flags = 0; diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index b807a31..8f06399 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -1009,7 +1009,7 @@ release: * Receive an Econet frame from a device. */ -static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct ec_framehdr *hdr; struct sock *sk; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a642fd6..6eb9c54 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 dest_ip, static void parp_redo(struct sk_buff *skb) { nf_reset(skb); - arp_rcv(skb, skb->dev, NULL); + arp_rcv(skb, skb->dev, NULL, skb->dev); } /* @@ -927,7 +927,7 @@ out: * Receive an arp request from the device layer. */ -int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct arphdr *arp; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c703528..d603247 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -358,7 +358,7 @@ drop: /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct iphdr *iph; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index d2bf8e1..63e1066 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -393,7 +393,7 @@ static int __init ic_defaults(void) #ifdef IPCONFIG_RARP -static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); +static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type rarp_packet_type __initdata = { .type = __constant_htons(ETH_P_RARP), @@ -414,7 +414,7 @@ static inline void ic_rarp_cleanup(void) * Process received RARP packet. */ static int __init -ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct arphdr *rarp; unsigned char *rarp_ptr; @@ -555,7 +555,7 @@ struct bootp_pkt { /* BOOTP packet format */ #define DHCPRELEASE 7 #define DHCPINFORM 8 -static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); +static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type bootp_packet_type __initdata = { .type = __constant_htons(ETH_P_IP), @@ -823,7 +823,7 @@ static void __init ic_do_bootp_ext(u8 *ext) /* * Receive BOOTP reply. */ -static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct bootp_pkt *b; struct iphdr *h; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 10fbb50..ab51c03 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -56,7 +56,7 @@ static inline int ip6_rcv_finish( struct sk_buff *skb) return dst_input(skb); } -int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct ipv6hdr *hdr; u32 pkt_len; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 5a27e5d..3a13c5d 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1627,7 +1627,7 @@ out: return rc; } -static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { /* NULL here for pt means the packet was looped back */ struct ipx_interface *intrfc; diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index eb65b49..3e9a06a 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c @@ -1303,7 +1303,7 @@ static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb, * Jean II */ int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *ptype) + struct packet_type *ptype, struct net_device *orig_dev) { struct irlap_info info; struct irlap_cb *self; diff --git a/net/irda/irmod.c b/net/irda/irmod.c index 6ffaed4..634901d 100644 --- a/net/irda/irmod.c +++ b/net/irda/irmod.c @@ -54,7 +54,7 @@ extern int irsock_init(void); extern void irsock_cleanup(void); /* irlap_frame.c */ extern int irlap_driver_rcv(struct sk_buff *, struct net_device *, - struct packet_type *); + struct packet_type *, struct net_device *); /* * Module parameters diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 5ff02c0..9727455 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -103,7 +103,8 @@ out: struct llc_sap *llc_sap_open(unsigned char lsap, int (*func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)) + struct packet_type *pt, + struct net_device *orig_dev)) { struct llc_sap *sap = llc_sap_find(lsap); diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 4da6976..13b4624 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -132,7 +132,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb) * data now), it queues this frame in the connection's backlog. */ int llc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { struct llc_sap *sap; struct llc_pdu_sn *pdu; @@ -165,7 +165,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, * LLC functionality */ if (sap->rcv_func) { - sap->rcv_func(skb, dev, pt); + sap->rcv_func(skb, dev, pt, orig_dev); goto out; } dest = llc_pdu_type(skb); diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 220bf74..83eb41d 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -64,7 +64,7 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) skb->nh.raw = skb->data; skb->pkt_type = PACKET_HOST; - ip_rcv(skb, skb->dev, NULL); + ip_rcv(skb, skb->dev, NULL, skb->dev); return 1; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c9d5980..deb5f6f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -241,7 +241,7 @@ static struct proto_ops packet_ops; #ifdef CONFIG_SOCK_PACKET static struct proto_ops packet_ops_spkt; -static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; @@ -441,7 +441,7 @@ static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned we will not harm anyone. */ -static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_ll *sll; @@ -546,7 +546,7 @@ drop: } #ifdef CONFIG_PACKET_MMAP -static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct packet_sock *po; diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 36fc3bf..adfe7b8 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -81,7 +81,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) } int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, - struct packet_type *ptype) + struct packet_type *ptype, struct net_device *orig_dev) { struct sk_buff *nskb; struct x25_neigh *nb; -- cgit v0.10.2 From 0742fd53a3774781255bd1e471e7aa2e4a82d5f7 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 9 Aug 2005 19:35:47 -0700 Subject: [IPV4]: possible cleanups This patch contains the following possible cleanups: - make needlessly global code static - #if 0 the following unused global function: - xfrm4_state.c: xfrm4_state_fini - remove the following unneeded EXPORT_SYMBOL's: - ip_output.c: ip_finish_output - ip_output.c: sysctl_ip_default_ttl - fib_frontend.c: ip_dev_find - inetpeer.c: inet_peer_idlock - ip_options.c: ip_options_compile - ip_options.c: ip_options_undo - net/core/request_sock.c: sysctl_max_syn_backlog Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller diff --git a/include/net/ip.h b/include/net/ip.h index 2570b53..c16fb6a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -140,8 +140,6 @@ struct ip_reply_arg { void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len); -extern int ip_finish_output(struct sk_buff *skb); - struct ipv4_config { int log_martians; diff --git a/include/net/route.h b/include/net/route.h index c3cd069..63c9455 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -105,10 +105,6 @@ struct rt_cache_stat unsigned int out_hlist_search; }; -extern struct rt_cache_stat *rt_cache_stat; -#define RT_CACHE_STAT_INC(field) \ - (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) - extern struct ip_rt_acct *ip_rt_acct; struct in_device; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 868ef88..a9d0d8c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -818,7 +818,6 @@ extern void xfrm6_init(void); extern void xfrm6_fini(void); extern void xfrm_state_init(void); extern void xfrm4_state_init(void); -extern void xfrm4_state_fini(void); extern void xfrm6_state_init(void); extern void xfrm6_state_fini(void); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 4e99ce5..98f0fc9 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -32,7 +32,6 @@ * Further increasing requires to change hash table size. */ int sysctl_max_syn_backlog = 256; -EXPORT_SYMBOL(sysctl_max_syn_backlog); int reqsk_queue_alloc(struct request_sock_queue *queue, const int nr_table_entries) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cd8e45a..e572208 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -662,5 +662,4 @@ void __init ip_fib_init(void) } EXPORT_SYMBOL(inet_addr_type); -EXPORT_SYMBOL(ip_dev_find); EXPORT_SYMBOL(ip_rt_ioctl); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index ab18a85..3c513ce 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -459,5 +459,3 @@ static void peer_check_expire(unsigned long dummy) peer_total / inet_peer_threshold * HZ; add_timer(&peer_periodic_timer); } - -EXPORT_SYMBOL(inet_peer_idlock); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 6d89f3f..7e02ba5 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -620,6 +620,3 @@ int ip_options_rcv_srr(struct sk_buff *skb) } return 0; } - -EXPORT_SYMBOL(ip_options_compile); -EXPORT_SYMBOL(ip_options_undo); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 766564c..c934f53 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -205,7 +205,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -int ip_finish_output(struct sk_buff *skb) +static int ip_finish_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; @@ -1328,12 +1328,7 @@ void __init ip_init(void) #endif } -EXPORT_SYMBOL(ip_finish_output); EXPORT_SYMBOL(ip_fragment); EXPORT_SYMBOL(ip_generic_getfrag); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(ip_send_check); - -#ifdef CONFIG_SYSCTL -EXPORT_SYMBOL(sysctl_ip_default_ttl); -#endif diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c index c9cf872..db67373 100644 --- a/net/ipv4/multipath_drr.c +++ b/net/ipv4/multipath_drr.c @@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this, return NOTIFY_DONE; } -struct notifier_block drr_dev_notifier = { +static struct notifier_block drr_dev_notifier = { .notifier_call = drr_dev_event, }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d675ff8..3aef0e1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -240,7 +240,9 @@ static unsigned rt_hash_mask; static int rt_hash_log; static unsigned int rt_hash_rnd; -struct rt_cache_stat *rt_cache_stat; +static struct rt_cache_stat *rt_cache_stat; +#define RT_CACHE_STAT_INC(field) \ + (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 050611d..d23e07f 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -128,8 +128,10 @@ void __init xfrm4_state_init(void) xfrm_state_register_afinfo(&xfrm4_state_afinfo); } +#if 0 void __exit xfrm4_state_fini(void) { xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); } +#endif /* 0 */ -- cgit v0.10.2 From 86e65da9c1fc6fb421b9f796b597b3eced6b55ab Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Aug 2005 19:36:29 -0700 Subject: [NET]: Remove explicit initializations of skb->input_dev Instead, set it in one place, namely the beginning of netif_receive_skb(). Based upon suggestions from Jamal Hadi Salim. Signed-off-by: David S. Miller diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c index f30e8e6..96c115e 100644 --- a/drivers/isdn/i4l/isdn_net.c +++ b/drivers/isdn/i4l/isdn_net.c @@ -1786,7 +1786,6 @@ isdn_net_receive(struct net_device *ndev, struct sk_buff *skb) lp->stats.rx_bytes += skb->len; } skb->dev = ndev; - skb->input_dev = ndev; skb->pkt_type = PACKET_HOST; skb->mac.raw = skb->data; #ifdef ISDN_DEBUG_NET_DUMP diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c index 260a323..d97a9be 100644 --- a/drivers/isdn/i4l/isdn_ppp.c +++ b/drivers/isdn/i4l/isdn_ppp.c @@ -1177,7 +1177,6 @@ isdn_ppp_push_higher(isdn_net_dev * net_dev, isdn_net_local * lp, struct sk_buff mlp->huptimer = 0; #endif /* CONFIG_IPPP_FILTER */ skb->dev = dev; - skb->input_dev = dev; skb->mac.raw = skb->data; netif_rx(skb); /* net_dev->local->stats.rx_packets++; done in isdn_net.c */ diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index a32668e..bb71638 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -1657,7 +1657,6 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) skb->dev = ppp->dev; skb->protocol = htons(npindex_to_ethertype[npi]); skb->mac.raw = skb->data; - skb->input_dev = ppp->dev; netif_rx(skb); ppp->dev->last_rx = jiffies; } diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4abda6a..b902d24 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -352,10 +352,10 @@ tcf_change_indev(struct tcf_proto *tp, char *indev, struct rtattr *indev_tlv) static inline int tcf_match_indev(struct sk_buff *skb, char *indev) { - if (0 != indev[0]) { - if (NULL == skb->input_dev) + if (indev[0]) { + if (!skb->input_dev) return 0; - else if (0 != strcmp(indev, skb->input_dev->name)) + if (strcmp(indev, skb->input_dev->name)) return 0; } diff --git a/include/net/x25device.h b/include/net/x25device.h index d45ae88..1a31837 100644 --- a/include/net/x25device.h +++ b/include/net/x25device.h @@ -8,7 +8,6 @@ static inline __be16 x25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->mac.raw = skb->data; - skb->input_dev = skb->dev = dev; skb->pkt_type = PACKET_HOST; return htons(ETH_P_X25); diff --git a/net/core/dev.c b/net/core/dev.c index e1cc162..9d153eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1536,17 +1536,14 @@ static int ing_filter(struct sk_buff *skb) __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); if (MAX_RED_LOOP < ttl++) { printk("Redir loop detected Dropping packet (%s->%s)\n", - skb->input_dev?skb->input_dev->name:"??",skb->dev->name); + skb->input_dev->name, skb->dev->name); return TC_ACT_SHOT; } skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); - if (NULL == skb->input_dev) { - skb->input_dev = skb->dev; - printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name); - } + spin_lock(&dev->ingress_lock); if ((q = dev->qdisc_ingress) != NULL) result = q->enqueue(skb, q); @@ -1572,6 +1569,9 @@ int netif_receive_skb(struct sk_buff *skb) if (!skb->stamp.tv_sec) net_timestamp(&skb->stamp); + if (!skb->input_dev) + skb->input_dev = skb->dev; + orig_dev = skb_bond(skb); __get_cpu_var(netdev_rx_stat).total++; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f6dbfb9..f444a2f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -163,7 +163,6 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb->mac.raw=skb->data; skb_pull(skb,ETH_HLEN); eth = eth_hdr(skb); - skb->input_dev = dev; if(*eth->h_dest&1) { -- cgit v0.10.2 From 373ac73595491b7c1f2f10cb37e9b7bae6901227 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 19:36:53 -0700 Subject: [NETFILTER]: C99 initizalizers for NAT protocols Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index 38fdfc2..7ed2fdb 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -106,16 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range) else return 0; } -struct ip_nat_protocol ip_nat_protocol_icmp -= { "ICMP", IPPROTO_ICMP, THIS_MODULE, - icmp_manip_pkt, - icmp_in_range, - icmp_unique_tuple, - icmp_print, - icmp_print_range, +struct ip_nat_protocol ip_nat_protocol_icmp = { + .name = "ICMP", + .protonum = IPPROTO_ICMP, + .me = THIS_MODULE, + .manip_pkt = icmp_manip_pkt, + .in_range = icmp_in_range, + .unique_tuple = icmp_unique_tuple, + .print = icmp_print, + .print_range = icmp_print_range, #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - ip_nat_port_range_to_nfattr, - ip_nat_port_nfattr_to_range, + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index f03cd0f..6113a16 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -170,16 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range) else return 0; } -struct ip_nat_protocol ip_nat_protocol_tcp -= { "TCP", IPPROTO_TCP, THIS_MODULE, - tcp_manip_pkt, - tcp_in_range, - tcp_unique_tuple, - tcp_print, - tcp_print_range, +struct ip_nat_protocol ip_nat_protocol_tcp = { + .name = "TCP", + .protonum = IPPROTO_TCP, + .me = THIS_MODULE, + .manip_pkt = tcp_manip_pkt, + .in_range = tcp_in_range, + .unique_tuple = tcp_unique_tuple, + .print = tcp_print, + .print_range = tcp_print_range, #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - ip_nat_port_range_to_nfattr, - ip_nat_port_nfattr_to_range, + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index 7a4e66e..689478e 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -156,16 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range) else return 0; } -struct ip_nat_protocol ip_nat_protocol_udp -= { "UDP", IPPROTO_UDP, THIS_MODULE, - udp_manip_pkt, - udp_in_range, - udp_unique_tuple, - udp_print, - udp_print_range, +struct ip_nat_protocol ip_nat_protocol_udp = { + .name = "UDP", + .protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .manip_pkt = udp_manip_pkt, + .in_range = udp_in_range, + .unique_tuple = udp_unique_tuple, + .print = udp_print, + .print_range = udp_print_range, #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - ip_nat_port_range_to_nfattr, - ip_nat_port_nfattr_to_range, + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index 512d8f2..99bbef5 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_unknown_protocol = { - "unknown", 0, THIS_MODULE, - unknown_manip_pkt, - unknown_in_range, - unknown_unique_tuple, - unknown_print, - unknown_print_range + .name = "unknown", + .me = THIS_MODULE, + .manip_pkt = unknown_manip_pkt, + .in_range = unknown_in_range, + .unique_tuple = unknown_unique_tuple, + .print = unknown_print, + .print_range = unknown_print_range }; -- cgit v0.10.2 From 089af26c706d1473f641c909fee7c878d29c1f1a Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:37:23 -0700 Subject: [NETFILTER]: Rename skb_ip_make_writable() to skb_make_writable() There is nothing IPv4-specific in it. In fact, it was already used by IPv6, too... Upcoming nfnetlink_queue code will use it for any kind of packet. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index ec60856..54b97a1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -193,6 +193,11 @@ extern void nf_ct_attach(struct sk_buff *, struct sk_buff *); /* FIXME: Before cache is ever used, this must be implemented for real. */ extern void nf_invalidate_cache(int pf); +/* Call this before modifying an existing packet: ensures it is + modifiable and linear to the point you care about (writable_len). + Returns true or false. */ +extern int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len); + #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index 552815b..fdc4a95 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -80,11 +80,6 @@ enum nf_ip_hook_priorities { #ifdef __KERNEL__ extern int ip_route_me_harder(struct sk_buff **pskb); -/* Call this before modifying an existing IP packet: ensures it is - modifiable and linear to the point you care about (writable_len). - Returns true or false. */ -extern int skb_ip_make_writable(struct sk_buff **pskb, - unsigned int writable_len); #endif /*__KERNEL__*/ #endif /*__LINUX_IP_NETFILTER_H*/ diff --git a/net/core/netfilter.c b/net/core/netfilter.c index 076c156..bbf9081 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -512,8 +512,9 @@ int ip_route_me_harder(struct sk_buff **pskb) return 0; } EXPORT_SYMBOL(ip_route_me_harder); +#endif /*CONFIG_INET*/ -int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len) +int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len) { struct sk_buff *nskb; @@ -540,8 +541,7 @@ copy_skb: *pskb = nskb; return 1; } -EXPORT_SYMBOL(skb_ip_make_writable); -#endif /*CONFIG_INET*/ +EXPORT_SYMBOL(skb_make_writable); /* Internal logging interface, which relies on the real LOG target modules */ diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 567c802..1adedb7 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -359,7 +359,7 @@ manip_pkt(u_int16_t proto, struct iphdr *iph; struct ip_nat_protocol *p; - if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) + if (!skb_make_writable(pskb, iphdroff + sizeof(*iph))) return 0; iph = (void *)(*pskb)->data + iphdroff; @@ -431,7 +431,7 @@ int icmp_reply_translation(struct sk_buff **pskb, struct ip_conntrack_tuple inner, target; int hdrlen = (*pskb)->nh.iph->ihl * 4; - if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside))) + if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) return 0; inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c index 158f34f..d2dd5d3 100644 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ b/net/ipv4/netfilter/ip_nat_helper.c @@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb, struct tcphdr *tcph; int datalen; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return 0; if (rep_len > match_len @@ -228,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb, match_offset + match_len) return 0; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return 0; if (rep_len > match_len @@ -315,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb, optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; - if (!skb_ip_make_writable(pskb, optend)) + if (!skb_make_writable(pskb, optend)) return 0; dir = CTINFO2DIR(ctinfo); @@ -363,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb, this_way = &ct->nat.info.seq[dir]; other_way = &ct->nat.info.seq[!dir]; - if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) return 0; tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index 7ed2fdb..9387190 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb, struct icmphdr *hdr; unsigned int hdroff = iphdroff + iph->ihl*4; - if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) return 0; hdr = (struct icmphdr *)((*pskb)->data + hdroff); diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index 6113a16..1d381bf 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -103,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb, if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); - if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) + if (!skb_make_writable(pskb, hdroff + hdrsize)) return 0; iph = (struct iphdr *)((*pskb)->data + iphdroff); diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index 689478e..c4906e1 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb, u32 oldip, newip; u16 *portptr, newport; - if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) return 0; iph = (struct iphdr *)((*pskb)->data + iphdroff); diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 2a48b6e..93b2c51 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb, return NF_DROP; } - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; spin_lock_bh(&snmp_lock); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index bc0af8d..ae975ac 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -388,7 +388,7 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) } skb_put(e->skb, diff); } - if (!skb_ip_make_writable(&e->skb, v->data_len)) + if (!skb_make_writable(&e->skb, v->data_len)) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c index 975476f..6e31957 100644 --- a/net/ipv4/netfilter/ipt_DSCP.c +++ b/net/ipv4/netfilter/ipt_DSCP.c @@ -39,7 +39,7 @@ target(struct sk_buff **pskb, if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index f63a9bc..a131969 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return 0; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; @@ -66,7 +66,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) tcph->cwr == einfo->proto.tcp.cwr))) return 1; - if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) return 0; tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c index 9492883..8db70d6 100644 --- a/net/ipv4/netfilter/ipt_TCPMSS.c +++ b/net/ipv4/netfilter/ipt_TCPMSS.c @@ -58,7 +58,7 @@ ipt_tcpmss_target(struct sk_buff **pskb, unsigned int i; u_int8_t *opt; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; if ((*pskb)->ip_summed == CHECKSUM_HW && diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index 49abb7e..deadb36 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -33,7 +33,7 @@ target(struct sk_buff **pskb, if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 83ccedc..7130603 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -384,7 +384,7 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) } skb_put(e->skb, diff); } - if (!skb_ip_make_writable(&e->skb, v->data_len)) + if (!skb_make_writable(&e->skb, v->data_len)) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; -- cgit v0.10.2 From 020b4c12dbe3868d792a01d7c1470cd837abe10f Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:39:00 -0700 Subject: [NETFILTER]: Move ipv4 specific code from net/core/netfilter.c to net/ipv4/netfilter.c Netfilter cleanup - Move ipv4 code from net/core/netfilter.c to net/ipv4/netfilter.c - Move ipv6 netfilter code from net/ipv6/ip6_output.c to net/ipv6/netfilter.c Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/core/netfilter.c b/net/core/netfilter.c index bbf9081..9849357 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -22,12 +22,7 @@ #include #include #include -#include -#include -#include #include -#include -#include /* In this code, we can be waiting indefinitely for userspace to * service a packet if a hook returns NF_QUEUE. We could keep a count @@ -447,73 +442,6 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info, return; } -#ifdef CONFIG_INET -/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff **pskb) -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct rtable *rt; - struct flowi fl = {}; - struct dst_entry *odst; - unsigned int hh_len; - - /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause - * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. - */ - if (inet_addr_type(iph->saddr) == RTN_LOCAL) { - fl.nl_u.ip4_u.daddr = iph->daddr; - fl.nl_u.ip4_u.saddr = iph->saddr; - fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); - fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; -#ifdef CONFIG_IP_ROUTE_FWMARK - fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; -#endif - fl.proto = iph->protocol; - if (ip_route_output_key(&rt, &fl) != 0) - return -1; - - /* Drop old route. */ - dst_release((*pskb)->dst); - (*pskb)->dst = &rt->u.dst; - } else { - /* non-local src, find valid iif to satisfy - * rp-filter when calling ip_route_input. */ - fl.nl_u.ip4_u.daddr = iph->saddr; - if (ip_route_output_key(&rt, &fl) != 0) - return -1; - - odst = (*pskb)->dst; - if (ip_route_input(*pskb, iph->daddr, iph->saddr, - RT_TOS(iph->tos), rt->u.dst.dev) != 0) { - dst_release(&rt->u.dst); - return -1; - } - dst_release(&rt->u.dst); - dst_release(odst); - } - - if ((*pskb)->dst->error) - return -1; - - /* Change in oif may mean change in hh_len. */ - hh_len = (*pskb)->dst->dev->hard_header_len; - if (skb_headroom(*pskb) < hh_len) { - struct sk_buff *nskb; - - nskb = skb_realloc_headroom(*pskb, hh_len); - if (!nskb) - return -1; - if ((*pskb)->sk) - skb_set_owner_w(nskb, (*pskb)->sk); - kfree_skb(*pskb); - *pskb = nskb; - } - - return 0; -} -EXPORT_SYMBOL(ip_route_me_harder); -#endif /*CONFIG_INET*/ - int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len) { struct sk_buff *nskb; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 55dc6cc..61c7386 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ - sysctl_net_ipv4.o fib_frontend.o fib_semantics.o + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c new file mode 100644 index 0000000..6594d1c --- /dev/null +++ b/net/ipv4/netfilter.c @@ -0,0 +1,79 @@ +#include + +#ifdef CONFIG_NETFILTER + +/* IPv4 specific functions of netfilter core */ +#include +#include + +#include +#include +#include +#include +#include + +/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ +int ip_route_me_harder(struct sk_buff **pskb) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct rtable *rt; + struct flowi fl = {}; + struct dst_entry *odst; + unsigned int hh_len; + + /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause + * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. + */ + if (inet_addr_type(iph->saddr) == RTN_LOCAL) { + fl.nl_u.ip4_u.daddr = iph->daddr; + fl.nl_u.ip4_u.saddr = iph->saddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); + fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; +#ifdef CONFIG_IP_ROUTE_FWMARK + fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; +#endif + fl.proto = iph->protocol; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + /* Drop old route. */ + dst_release((*pskb)->dst); + (*pskb)->dst = &rt->u.dst; + } else { + /* non-local src, find valid iif to satisfy + * rp-filter when calling ip_route_input. */ + fl.nl_u.ip4_u.daddr = iph->saddr; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + odst = (*pskb)->dst; + if (ip_route_input(*pskb, iph->daddr, iph->saddr, + RT_TOS(iph->tos), rt->u.dst.dev) != 0) { + dst_release(&rt->u.dst); + return -1; + } + dst_release(&rt->u.dst); + dst_release(odst); + } + + if ((*pskb)->dst->error) + return -1; + + /* Change in oif may mean change in hh_len. */ + hh_len = (*pskb)->dst->dev->hard_header_len; + if (skb_headroom(*pskb) < hh_len) { + struct sk_buff *nskb; + + nskb = skb_realloc_headroom(*pskb, hh_len); + if (!nskb) + return -1; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + + return 0; +} +EXPORT_SYMBOL(ip_route_me_harder); +#endif /* CONFIG_NETFILTER */ diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index b39e049..5bccea2 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -8,7 +8,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ - ip6_flowlabel.o ipv6_syms.o + ip6_flowlabel.o ipv6_syms.o netfilter.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ xfrm6_output.o diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 590d2b7..a7fcbcc 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -153,38 +153,6 @@ int ip6_output(struct sk_buff *skb) return ip6_output2(skb); } -#ifdef CONFIG_NETFILTER -int ip6_route_me_harder(struct sk_buff *skb) -{ - struct ipv6hdr *iph = skb->nh.ipv6h; - struct dst_entry *dst; - struct flowi fl = { - .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, - .nl_u = - { .ip6_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, } }, - .proto = iph->nexthdr, - }; - - dst = ip6_route_output(skb->sk, &fl); - - if (dst->error) { - IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n")); - dst_release(dst); - return -EINVAL; - } - - /* Drop old route. */ - dst_release(skb->dst); - - skb->dst = dst; - return 0; -} -#endif - /* * xmit an sk_buff (used by TCP) */ diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c index 5ade5a5..37a4a99 100644 --- a/net/ipv6/ipv6_syms.c +++ b/net/ipv6/ipv6_syms.c @@ -15,9 +15,6 @@ EXPORT_SYMBOL(ndisc_mc_map); EXPORT_SYMBOL(register_inet6addr_notifier); EXPORT_SYMBOL(unregister_inet6addr_notifier); EXPORT_SYMBOL(ip6_route_output); -#ifdef CONFIG_NETFILTER -EXPORT_SYMBOL(ip6_route_me_harder); -#endif EXPORT_SYMBOL(addrconf_lock); EXPORT_SYMBOL(ipv6_setsockopt); EXPORT_SYMBOL(ipv6_getsockopt); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c new file mode 100644 index 0000000..5656d09 --- /dev/null +++ b/net/ipv6/netfilter.c @@ -0,0 +1,43 @@ +#include +#include + +#ifdef CONFIG_NETFILTER + +#include +#include +#include +#include +#include + +int ip6_route_me_harder(struct sk_buff *skb) +{ + struct ipv6hdr *iph = skb->nh.ipv6h; + struct dst_entry *dst; + struct flowi fl = { + .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .nl_u = + { .ip6_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, } }, + .proto = iph->nexthdr, + }; + + dst = ip6_route_output(skb->sk, &fl); + + if (dst->error) { + IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n")); + dst_release(dst); + return -EINVAL; + } + + /* Drop old route. */ + dst_release(skb->dst); + + skb->dst = dst; + return 0; +} +EXPORT_SYMBOL(ip6_route_me_harder); + +#endif /* CONFIG_NETFILTER */ -- cgit v0.10.2 From 4fdb3bb723db469717c6d38fda667d8b0fa86ebd Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:40:55 -0700 Subject: [NETLINK]: Add properly module refcounting for kernel netlink sockets. - Remove bogus code for compiling netlink as module - Add module refcounting support for modules implementing a netlink protocol - Add support for autoloading modules that implement a netlink protocol as soon as someone opens a socket for that protocol Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index b5a5e04..8809788 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -88,7 +88,7 @@ static struct w1_master * w1_alloc_dev(u32 id, int slave_count, int slave_ttl, dev->groups = 23; dev->seq = 1; - dev->nls = netlink_kernel_create(NETLINK_W1, NULL); + dev->nls = netlink_kernel_create(NETLINK_W1, NULL, THIS_MODULE); if (!dev->nls) { printk(KERN_ERR "Failed to create new netlink socket(%u) for w1 master %s.\n", NETLINK_NFLOG, dev->dev.bus_id); @@ -225,3 +225,5 @@ void w1_remove_master_device(struct w1_bus_master *bm) EXPORT_SYMBOL(w1_add_master_device); EXPORT_SYMBOL(w1_remove_master_device); + +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_W1); diff --git a/include/linux/net.h b/include/linux/net.h index 20cb226..3990661 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -282,5 +282,8 @@ static struct proto_ops name##_ops = { \ #define MODULE_ALIAS_NETPROTO(proto) \ MODULE_ALIAS("net-pf-" __stringify(proto)) +#define MODULE_ALIAS_NET_PF_PROTO(pf, proto) \ + MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto)) + #endif /* __KERNEL__ */ #endif /* _LINUX_NET_H */ diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6552b71..1c50fea 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -117,7 +117,7 @@ struct netlink_skb_parms #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) -extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)); +extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct module *module); extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock); extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid, diff --git a/kernel/audit.c b/kernel/audit.c index ef35166..ed40195 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -514,7 +514,8 @@ static int __init audit_init(void) { printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); + audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive, + THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 8e49d21..88f4d74 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -153,7 +153,8 @@ EXPORT_SYMBOL_GPL(kobject_uevent_atomic); static int __init kobject_uevent_init(void) { - uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, NULL); + uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, NULL, + THIS_MODULE); if (!uevent_sock) { printk(KERN_ERR diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 01af4fc..561d75c 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -258,7 +258,7 @@ static int __init init(void) spin_lock_init(&ulog_buffers[i].lock); } - ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL); + ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL, THIS_MODULE); if (!ebtulognl) ret = -ENOMEM; else if ((ret = ebt_register_watcher(&ulog))) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4b1bb30..9b3c61f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -708,7 +708,7 @@ void __init rtnetlink_init(void) if (!rta_buf) panic("rtnetlink_init: cannot allocate rta_buf\n"); - rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); + rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv, THIS_MODULE); if (rtnl == NULL) panic("rtnetlink_init: cannot initialize rtnetlink\n"); netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 284a999..3068fdd 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -138,7 +138,8 @@ static int __init init(void) { int rv = 0; - dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk); + dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk, + THIS_MODULE); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); return -ENOMEM; @@ -162,6 +163,7 @@ static void __exit fini(void) MODULE_DESCRIPTION("DECnet Routing Message Grabulator"); MODULE_AUTHOR("Steven Whitehouse "); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG); module_init(init); module_exit(fini); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e572208..b5e2f15 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -567,7 +567,7 @@ static void nl_fib_input(struct sock *sk, int len) static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input); + netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index ae975ac..b237f7f 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -692,7 +692,8 @@ init_or_cleanup(int init) goto cleanup; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); + ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk, + THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 52a0076..4ea8371 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -62,6 +62,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("iptables userspace logging module"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); #define ULOG_NL_EVENT 111 /* Harald's favorite number */ #define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ @@ -372,7 +373,7 @@ static int __init init(void) ulog_buffers[i].timer.data = i; } - nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL); + nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL, THIS_MODULE); if (!nflognl) return -ENOMEM; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index f66945c..f79bd11 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -774,7 +774,8 @@ static void tcpdiag_rcv(struct sock *sk, int len) static int __init tcpdiag_init(void) { - tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); + tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv, + THIS_MODULE); if (tcpnl == NULL) return -ENOMEM; return 0; diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 7130603..1c3d247 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -685,7 +685,7 @@ init_or_cleanup(int init) goto cleanup; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk); + ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index b0ed5798..6210ca4 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -38,6 +38,8 @@ #include MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); static char __initdata nfversion[] = "0.30"; @@ -324,7 +326,8 @@ int __init nfnetlink_init(void) { printk("Netfilter messages via NETLINK v%s.\n", nfversion); - nfnl = netlink_kernel_create(NETLINK_NETFILTER, nfnetlink_rcv); + nfnl = netlink_kernel_create(NETLINK_NETFILTER, nfnetlink_rcv, + THIS_MODULE); if (!nfnl) { printk(KERN_ERR "cannot initialize nfnetlink!\n"); return -1; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ff774a0..5d487cd 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -13,7 +13,12 @@ * added netlink_proto_exit * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo * use nlk_sk, as sk->protinfo is on a diet 8) - * + * Fri Jul 22 19:51:12 MEST 2005 Harald Welte + * - inc module use count of module that owns + * the kernel socket in case userspace opens + * socket of same protocol + * - remove all module support, since netlink is + * mandatory if CONFIG_NET=y these days */ #include @@ -92,6 +97,7 @@ struct netlink_table { struct nl_pid_hash hash; struct hlist_head mc_list; unsigned int nl_nonroot; + struct proto_ops *p_ops; }; static struct netlink_table *nl_table; @@ -341,7 +347,21 @@ static int netlink_create(struct socket *sock, int protocol) if (protocol<0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; - sock->ops = &netlink_ops; + netlink_table_grab(); + if (!nl_table[protocol].hash.entries) { +#ifdef CONFIG_KMOD + /* We do 'best effort'. If we find a matching module, + * it is loaded. If not, we don't return an error to + * allow pure userspace<->userspace communication. -HW + */ + netlink_table_ungrab(); + request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); + netlink_table_grab(); +#endif + } + netlink_table_ungrab(); + + sock->ops = nl_table[protocol].p_ops; sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); if (!sk) @@ -394,6 +414,22 @@ static int netlink_release(struct socket *sock) }; notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } + + /* When this is a kernel socket, we need to remove the owner pointer, + * since we don't know whether the module will be dying at any given + * point - HW + */ + if (!nlk->pid) { + struct proto_ops *p_tmp; + + netlink_table_grab(); + p_tmp = nl_table[sk->sk_protocol].p_ops; + if (p_tmp != &netlink_ops) { + nl_table[sk->sk_protocol].p_ops = &netlink_ops; + kfree(p_tmp); + } + netlink_table_ungrab(); + } sock_put(sk); return 0; @@ -1023,8 +1059,9 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) +netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct module *module) { + struct proto_ops *p_ops; struct socket *sock; struct sock *sk; @@ -1034,22 +1071,63 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) if (unit<0 || unit>=MAX_LINKS) return NULL; + /* Do a quick check, to make us not go down to netlink_insert() + * if protocol already has kernel socket. + */ + sk = netlink_lookup(unit, 0); + if (unlikely(sk)) { + sock_put(sk); + return NULL; + } + if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; + sk = NULL; + if (module) { + /* Every registering protocol implemented in a module needs + * it's own p_ops, since the socket code cannot deal with + * module refcounting otherwise. -HW + */ + p_ops = kmalloc(sizeof(*p_ops), GFP_KERNEL); + if (!p_ops) + goto out_sock_release; + + memcpy(p_ops, &netlink_ops, sizeof(*p_ops)); + p_ops->owner = module; + } else + p_ops = &netlink_ops; + + netlink_table_grab(); + nl_table[unit].p_ops = p_ops; + netlink_table_ungrab(); + if (netlink_create(sock, unit) < 0) { - sock_release(sock); - return NULL; + sk = NULL; + goto out_kfree_p_ops; } + sk = sock->sk; sk->sk_data_ready = netlink_data_ready; if (input) nlk_sk(sk)->data_ready = input; if (netlink_insert(sk, 0)) { - sock_release(sock); - return NULL; + sk = NULL; + goto out_kfree_p_ops; + } + + return sk; + +out_kfree_p_ops: + netlink_table_grab(); + if (nl_table[unit].p_ops != &netlink_ops) { + kfree(nl_table[unit].p_ops); + nl_table[unit].p_ops = &netlink_ops; } + netlink_table_ungrab(); +out_sock_release: + sock_release(sock); return sk; } @@ -1413,6 +1491,8 @@ enomem: for (i = 0; i < MAX_LINKS; i++) { struct nl_pid_hash *hash = &nl_table[i].hash; + nl_table[i].p_ops = &netlink_ops; + hash->table = nl_pid_hash_alloc(1 * sizeof(*hash->table)); if (!hash->table) { while (i-- > 0) @@ -1438,21 +1518,7 @@ out: return err; } -static void __exit netlink_proto_exit(void) -{ - sock_unregister(PF_NETLINK); - proc_net_remove("netlink"); - kfree(nl_table); - nl_table = NULL; - proto_unregister(&netlink_proto); -} - core_initcall(netlink_proto_init); -module_exit(netlink_proto_exit); - -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_NETPROTO(PF_NETLINK); EXPORT_SYMBOL(netlink_ack); EXPORT_SYMBOL(netlink_broadcast); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8da3e25..33ceeea 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1519,7 +1519,8 @@ static int __init xfrm_user_init(void) { printk(KERN_INFO "Initializing IPsec netlink socket\n"); - xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv); + xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv, + THIS_MODULE); if (xfrm_nl == NULL) return -ENOMEM; @@ -1537,3 +1538,4 @@ static void __exit xfrm_user_exit(void) module_init(xfrm_user_init); module_exit(xfrm_user_exit); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index 18d08ac..341dbe2 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -103,7 +103,7 @@ void selnl_notify_policyload(u32 seqno) static int __init selnl_init(void) { - selnl = netlink_kernel_create(NETLINK_SELINUX, NULL); + selnl = netlink_kernel_create(NETLINK_SELINUX, NULL, THIS_MODULE); if (selnl == NULL) panic("SELinux: Cannot create netlink socket."); netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV); -- cgit v0.10.2 From 2cc7d5730957c4a3f3659d17d2ba5e06d5581c1f Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:42:34 -0700 Subject: [NETFILTER]: Move reroute-after-queue code up to the nf_queue layer. The rerouting functionality is required by the core, therefore it has to be implemented by the core and not in individual queue handlers. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 54b97a1..d163e20 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -198,6 +198,17 @@ extern void nf_invalidate_cache(int pf); Returns true or false. */ extern int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len); +struct nf_queue_rerouter { + void (*save)(const struct sk_buff *skb, struct nf_info *info); + int (*reroute)(struct sk_buff **skb, const struct nf_info *info); + int rer_size; +}; + +#define nf_info_reroute(x) ((void *)x + sizeof(struct nf_info)) + +extern int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer); +extern int nf_unregister_queue_rerouter(int pf); + #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 20c069a..5d204ee 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -71,4 +71,7 @@ enum nf_ip6_hook_priorities { NF_IP6_PRI_LAST = INT_MAX, }; +int ipv6_netfilter_init(void); +void ipv6_netfilter_fini(void); + #endif /*__LINUX_IP6_NETFILTER_H*/ diff --git a/net/core/netfilter.c b/net/core/netfilter.c index 9849357..1ed4f31 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -53,6 +53,9 @@ static struct nf_queue_handler_t { nf_queue_outfn_t outfn; void *data; } queue_handler[NPROTO]; + +static struct nf_queue_rerouter *queue_rerouter; + static DEFINE_RWLOCK(queue_handler_lock); int nf_register_hook(struct nf_hook_ops *reg) @@ -260,11 +263,34 @@ int nf_unregister_queue_handler(int pf) return 0; } +int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer) +{ + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + memcpy(&queue_rerouter[pf], rer, sizeof(queue_rerouter[pf])); + write_unlock_bh(&queue_handler_lock); + + return 0; +} + +int nf_unregister_queue_rerouter(int pf) +{ + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf])); + write_unlock_bh(&queue_handler_lock); + return 0; +} + /* * Any packet that leaves via this function must come back * through nf_reinject(). */ -static int nf_queue(struct sk_buff *skb, +static int nf_queue(struct sk_buff **skb, struct list_head *elem, int pf, unsigned int hook, struct net_device *indev, @@ -282,17 +308,17 @@ static int nf_queue(struct sk_buff *skb, read_lock(&queue_handler_lock); if (!queue_handler[pf].outfn) { read_unlock(&queue_handler_lock); - kfree_skb(skb); + kfree_skb(*skb); return 1; } - info = kmalloc(sizeof(*info), GFP_ATOMIC); + info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC); if (!info) { if (net_ratelimit()) printk(KERN_ERR "OOM queueing packet %p\n", - skb); + *skb); read_unlock(&queue_handler_lock); - kfree_skb(skb); + kfree_skb(*skb); return 1; } @@ -311,15 +337,21 @@ static int nf_queue(struct sk_buff *skb, if (outdev) dev_hold(outdev); #ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - physindev = skb->nf_bridge->physindev; + if ((*skb)->nf_bridge) { + physindev = (*skb)->nf_bridge->physindev; if (physindev) dev_hold(physindev); - physoutdev = skb->nf_bridge->physoutdev; + physoutdev = (*skb)->nf_bridge->physoutdev; if (physoutdev) dev_hold(physoutdev); } #endif + if (queue_rerouter[pf].save) + queue_rerouter[pf].save(*skb, info); + + status = queue_handler[pf].outfn(*skb, info, queue_handler[pf].data); + + if (status >= 0 && queue_rerouter[pf].reroute) + status = queue_rerouter[pf].reroute(skb, info); - status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data); read_unlock(&queue_handler_lock); if (status < 0) { @@ -332,9 +364,11 @@ static int nf_queue(struct sk_buff *skb, #endif module_put(info->elem->owner); kfree(info); - kfree_skb(skb); + kfree_skb(*skb); + return 1; } + return 1; } @@ -365,7 +399,7 @@ next_hook: ret = -EPERM; } else if (verdict == NF_QUEUE) { NFDEBUG("nf_hook: Verdict = QUEUE.\n"); - if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn)) + if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn)) goto next_hook; } unlock: @@ -428,7 +462,7 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info, break; case NF_QUEUE: - if (!nf_queue(skb, elem, info->pf, info->hook, + if (!nf_queue(&skb, elem, info->pf, info->hook, info->indev, info->outdev, info->okfn)) goto next_hook; break; @@ -555,6 +589,12 @@ void __init netfilter_init(void) { int i, h; + queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter), + GFP_KERNEL); + if (!queue_rerouter) + panic("netfilter: cannot allocate queue rerouter array\n"); + memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter)); + for (i = 0; i < NPROTO; i++) { for (h = 0; h < NF_MAX_HOOKS; h++) INIT_LIST_HEAD(&nf_hooks[i][h]); @@ -573,4 +613,6 @@ EXPORT_SYMBOL(nf_reinject); EXPORT_SYMBOL(nf_setsockopt); EXPORT_SYMBOL(nf_unregister_hook); EXPORT_SYMBOL(nf_unregister_queue_handler); +EXPORT_SYMBOL_GPL(nf_register_queue_rerouter); +EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter); EXPORT_SYMBOL(nf_unregister_sockopt); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 6594d1c..ae0779d 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -1,10 +1,11 @@ -#include +/* IPv4 specific functions of netfilter core */ +#include #ifdef CONFIG_NETFILTER -/* IPv4 specific functions of netfilter core */ #include #include +#include #include #include @@ -76,4 +77,63 @@ int ip_route_me_harder(struct sk_buff **pskb) return 0; } EXPORT_SYMBOL(ip_route_me_harder); + +/* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + +struct ip_rt_info { + u_int32_t daddr; + u_int32_t saddr; + u_int8_t tos; +}; + +static void queue_save(const struct sk_buff *skb, struct nf_info *info) +{ + struct ip_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP_LOCAL_OUT) { + const struct iphdr *iph = skb->nh.iph; + + rt_info->tos = iph->tos; + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + } +} + +static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info) +{ + const struct ip_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP_LOCAL_OUT) { + struct iphdr *iph = (*pskb)->nh.iph; + + if (!(iph->tos == rt_info->tos + && iph->daddr == rt_info->daddr + && iph->saddr == rt_info->saddr)) + return ip_route_me_harder(pskb); + } + return 0; +} + +static struct nf_queue_rerouter ip_reroute = { + .rer_size = sizeof(struct ip_rt_info), + .save = queue_save, + .reroute = queue_reroute, +}; + +static int init(void) +{ + return nf_register_queue_rerouter(PF_INET, &ip_reroute); +} + +static void fini(void) +{ + nf_unregister_queue_rerouter(PF_INET); +} + +module_init(init); +module_exit(fini); + #endif /* CONFIG_NETFILTER */ diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index b237f7f..7889298 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -43,17 +43,10 @@ #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" -struct ipq_rt_info { - __u8 tos; - __u32 daddr; - __u32 saddr; -}; - struct ipq_queue_entry { struct list_head list; struct nf_info *info; struct sk_buff *skb; - struct ipq_rt_info rt_info; }; typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); @@ -305,14 +298,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) entry->info = info; entry->skb = skb; - if (entry->info->hook == NF_IP_LOCAL_OUT) { - struct iphdr *iph = skb->nh.iph; - - entry->rt_info.tos = iph->tos; - entry->rt_info.daddr = iph->daddr; - entry->rt_info.saddr = iph->saddr; - } - nskb = ipq_build_packet_message(entry, &status); if (nskb == NULL) goto err_out_free; @@ -393,18 +378,6 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; - /* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - */ - if (e->info->hook == NF_IP_LOCAL_OUT) { - struct iphdr *iph = e->skb->nh.iph; - - if (!(iph->tos == e->rt_info.tos - && iph->daddr == e->rt_info.daddr - && iph->saddr == e->rt_info.saddr)) - return ip_route_me_harder(&e->skb); - } return 0; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 28d9bca..5740473 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -757,6 +758,9 @@ static int __init inet6_init(void) err = igmp6_init(&inet6_family_ops); if (err) goto igmp_fail; + err = ipv6_netfilter_init(); + if (err) + goto netfilter_fail; /* Create /proc/foo6 entries. */ #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -813,6 +817,8 @@ proc_tcp6_fail: raw6_proc_exit(); proc_raw6_fail: #endif + ipv6_netfilter_fini(); +netfilter_fail: igmp6_cleanup(); igmp_fail: ndisc_cleanup(); @@ -852,6 +858,7 @@ static void __exit inet6_exit(void) ip6_route_cleanup(); ipv6_packet_cleanup(); igmp6_cleanup(); + ipv6_netfilter_fini(); ndisc_cleanup(); icmpv6_cleanup(); #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 5656d09..c8daef9 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -5,6 +5,8 @@ #include #include +#include +#include #include #include #include @@ -40,4 +42,64 @@ int ip6_route_me_harder(struct sk_buff *skb) } EXPORT_SYMBOL(ip6_route_me_harder); +/* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + +struct ip6_rt_info { + struct in6_addr daddr; + struct in6_addr saddr; +}; + +static void save(const struct sk_buff *skb, struct nf_info *info) +{ + struct ip6_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP6_LOCAL_OUT) { + struct ipv6hdr *iph = skb->nh.ipv6h; + + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + } +} + +static int reroute(struct sk_buff **pskb, const struct nf_info *info) +{ + struct ip6_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP6_LOCAL_OUT) { + struct ipv6hdr *iph = (*pskb)->nh.ipv6h; + if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || + !ipv6_addr_equal(&iph->saddr, &rt_info->saddr)) + return ip6_route_me_harder(*pskb); + } + return 0; +} + +static struct nf_queue_rerouter ip6_reroute = { + .rer_size = sizeof(struct ip6_rt_info), + .save = &save, + .reroute = &reroute, +}; + +int __init ipv6_netfilter_init(void) +{ + return nf_register_queue_rerouter(PF_INET6, &ip6_reroute); +} + +void ipv6_netfilter_fini(void) +{ + nf_unregister_queue_rerouter(PF_INET6); +} + +#else /* CONFIG_NETFILTER */ +int __init ipv6_netfilter_init(void) +{ + return 0; +} + +void ipv6_netfilter_fini(void) +{ +} #endif /* CONFIG_NETFILTER */ diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 1c3d247..c45d8f8 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -47,16 +47,10 @@ #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" -struct ipq_rt_info { - struct in6_addr daddr; - struct in6_addr saddr; -}; - struct ipq_queue_entry { struct list_head list; struct nf_info *info; struct sk_buff *skb; - struct ipq_rt_info rt_info; }; typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); @@ -302,13 +296,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) entry->info = info; entry->skb = skb; - if (entry->info->hook == NF_IP_LOCAL_OUT) { - struct ipv6hdr *iph = skb->nh.ipv6h; - - entry->rt_info.daddr = iph->daddr; - entry->rt_info.saddr = iph->saddr; - } - nskb = ipq_build_packet_message(entry, &status); if (nskb == NULL) goto err_out_free; @@ -389,17 +376,6 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; - /* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - * Not a nice way to cmp, but works - */ - if (e->info->hook == NF_IP_LOCAL_OUT) { - struct ipv6hdr *iph = e->skb->nh.ipv6h; - if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) || - !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr)) - return ip6_route_me_harder(e->skb); - } return 0; } -- cgit v0.10.2 From 0ab43f84995f2c2fcc5cc58a9accaa1095e1317f Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:43:44 -0700 Subject: [NETFILTER]: Core changes required by upcoming nfnetlink_queue code - split netfiler verdict in 16bit verdict and 16bit queue number - add 'queuenum' argument to nf_queue_outfn_t and its users ip[6]_queue - move NFNL_SUBSYS_ definitions from enum to #define - introduce autoloading for nfnetlink subsystem modules - add MODULE_ALIAS_NFNL_SUBSYS macro - add nf_unregister_queue_handlers() to register all handlers for a given nf_queue_outfn_t - add more verbose DEBUGP macro definition to nfnetlink.c - make nfnetlink_subsys_register fail if subsys already exists - add some more comments and debug statements to nfnetlink.c Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index d163e20..711e05f 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -21,6 +21,16 @@ #define NF_STOP 5 #define NF_MAX_VERDICT NF_STOP +/* we overload the higher bits for encoding auxiliary data such as the queue + * number. Not nice, but better than additional function arguments. */ +#define NF_VERDICT_MASK 0x0000ffff +#define NF_VERDICT_BITS 16 + +#define NF_VERDICT_QMASK 0xffff0000 +#define NF_VERDICT_QBITS 16 + +#define NF_QUEUE_NR(x) ((x << NF_VERDICT_QBITS) & NF_VERDICT_QMASK || NF_QUEUE) + /* only for userspace compatibility */ #ifndef __KERNEL__ /* Generic cache responses from hook functions. @@ -179,10 +189,12 @@ int nf_getsockopt(struct sock *sk, int pf, int optval, char __user *opt, /* Packet queuing */ typedef int (*nf_queue_outfn_t)(struct sk_buff *skb, - struct nf_info *info, void *data); + struct nf_info *info, + unsigned int queuenum, void *data); extern int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data); extern int nf_unregister_queue_handler(int pf); +extern void nf_unregister_queue_handlers(nf_queue_outfn_t outfn); extern void nf_reinject(struct sk_buff *skb, struct nf_info *info, unsigned int verdict); diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index ace7a7b..561f9df 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -69,15 +69,14 @@ struct nfgenmsg { #define NFNL_SUBSYS_ID(x) ((x & 0xff00) >> 8) #define NFNL_MSG_TYPE(x) (x & 0x00ff) -enum nfnl_subsys_id { - NFNL_SUBSYS_NONE = 0, - NFNL_SUBSYS_CTNETLINK, - NFNL_SUBSYS_CTNETLINK_EXP, - NFNL_SUBSYS_IPTNETLINK, - NFNL_SUBSYS_QUEUE, - NFNL_SUBSYS_ULOG, - NFNL_SUBSYS_COUNT, -}; +/* No enum here, otherwise __stringify() trick of MODULE_ALIAS_NFNL_SUBSYS() + * won't work anymore */ +#define NFNL_SUBSYS_NONE 0 +#define NFNL_SUBSYS_CTNETLINK 1 +#define NFNL_SUBSYS_CTNETLINK_EXP 2 +#define NFNL_SUBSYS_QUEUE 3 +#define NFNL_SUBSYS_ULOG 4 +#define NFNL_SUBSYS_COUNT 5 #ifdef __KERNEL__ @@ -142,5 +141,8 @@ extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo); extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags); +#define MODULE_ALIAS_NFNL_SUBSYS(subsys) \ + MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys)) + #endif /* __KERNEL__ */ #endif /* _NFNETLINK_H */ diff --git a/net/core/netfilter.c b/net/core/netfilter.c index 1ed4f31..3e38084 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -221,7 +221,8 @@ static unsigned int nf_iterate(struct list_head *head, verdict = elem->hook(hook, skb, indev, outdev, okfn); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG - if (unlikely(verdict > NF_MAX_VERDICT)) { + if (unlikely((verdict & NF_VERDICT_MASK) + > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", elem->hook, hook); continue; @@ -239,6 +240,9 @@ int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) { int ret; + if (pf >= NPROTO) + return -EINVAL; + write_lock_bh(&queue_handler_lock); if (queue_handler[pf].outfn) ret = -EBUSY; @@ -255,6 +259,9 @@ int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) /* The caller must flush their queue before this */ int nf_unregister_queue_handler(int pf) { + if (pf >= NPROTO) + return -EINVAL; + write_lock_bh(&queue_handler_lock); queue_handler[pf].outfn = NULL; queue_handler[pf].data = NULL; @@ -286,6 +293,20 @@ int nf_unregister_queue_rerouter(int pf) return 0; } +void nf_unregister_queue_handlers(nf_queue_outfn_t outfn) +{ + int pf; + + write_lock_bh(&queue_handler_lock); + for (pf = 0; pf < NPROTO; pf++) { + if (queue_handler[pf].outfn == outfn) { + queue_handler[pf].outfn = NULL; + queue_handler[pf].data = NULL; + } + } + write_unlock_bh(&queue_handler_lock); +} + /* * Any packet that leaves via this function must come back * through nf_reinject(). @@ -295,7 +316,8 @@ static int nf_queue(struct sk_buff **skb, int pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, - int (*okfn)(struct sk_buff *)) + int (*okfn)(struct sk_buff *), + unsigned int queuenum) { int status; struct nf_info *info; @@ -347,7 +369,8 @@ static int nf_queue(struct sk_buff **skb, if (queue_rerouter[pf].save) queue_rerouter[pf].save(*skb, info); - status = queue_handler[pf].outfn(*skb, info, queue_handler[pf].data); + status = queue_handler[pf].outfn(*skb, info, queuenum, + queue_handler[pf].data); if (status >= 0 && queue_rerouter[pf].reroute) status = queue_rerouter[pf].reroute(skb, info); @@ -397,9 +420,10 @@ next_hook: } else if (verdict == NF_DROP) { kfree_skb(*pskb); ret = -EPERM; - } else if (verdict == NF_QUEUE) { + } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { NFDEBUG("nf_hook: Verdict = QUEUE.\n"); - if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn)) + if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn, + verdict >> NF_VERDICT_BITS)) goto next_hook; } unlock: @@ -456,14 +480,15 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info, info->okfn, INT_MIN); } - switch (verdict) { + switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: info->okfn(skb); break; case NF_QUEUE: if (!nf_queue(&skb, elem, info->pf, info->hook, - info->indev, info->outdev, info->okfn)) + info->indev, info->outdev, info->okfn, + verdict >> NF_VERDICT_BITS)) goto next_hook; break; } @@ -613,6 +638,7 @@ EXPORT_SYMBOL(nf_reinject); EXPORT_SYMBOL(nf_setsockopt); EXPORT_SYMBOL(nf_unregister_hook); EXPORT_SYMBOL(nf_unregister_queue_handler); +EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); EXPORT_SYMBOL_GPL(nf_register_queue_rerouter); EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter); EXPORT_SYMBOL(nf_unregister_sockopt); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 7889298..cfc886f 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -280,7 +280,8 @@ nlmsg_failure: } static int -ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data) { int status = -EINVAL; struct sk_buff *nskb; diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index c45d8f8..5af4cee 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -278,7 +278,8 @@ nlmsg_failure: } static int -ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data) { int status = -EINVAL; struct sk_buff *nskb; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 6210ca4..30b25f4 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -44,7 +44,9 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); static char __initdata nfversion[] = "0.30"; #if 0 -#define DEBUGP printk +#define DEBUGP(format, args...) \ + printk(KERN_DEBUG "%s(%d):%s(): " format, __FILE__, \ + __LINE__, __FUNCTION__, ## args) #else #define DEBUGP(format, args...) #endif @@ -67,11 +69,11 @@ int nfnetlink_subsys_register(struct nfnetlink_subsystem *n) { DEBUGP("registering subsystem ID %u\n", n->subsys_id); - /* If the netlink socket wasn't created, then fail */ - if (!nfnl) - return -1; - nfnl_lock(); + if (subsys_table[n->subsys_id]) { + nfnl_unlock(); + return -EBUSY; + } subsys_table[n->subsys_id] = n; nfnl_unlock(); @@ -227,8 +229,18 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, type = nlh->nlmsg_type; ss = nfnetlink_get_subsys(type); - if (!ss) + if (!ss) { +#ifdef CONFIG_KMOD + /* don't call nfnl_shunlock, since it would reenter + * with further packet processing */ + up(&nfnl_sem); + request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); + nfnl_shlock(); + ss = nfnetlink_get_subsys(type); + if (!ss) +#endif goto err_inval; + } nc = nfnetlink_find_client(type, ss); if (!nc) { @@ -252,12 +264,14 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, if (err < 0) goto err_inval; + DEBUGP("calling handler\n"); err = nc->call(nfnl, skb, nlh, cda, errp); *errp = err; return err; } err_inval: + DEBUGP("returning -EINVAL\n"); *errp = -EINVAL; return -1; } @@ -311,6 +325,8 @@ static void nfnetlink_rcv(struct sock *sk, int len) kfree_skb(skb); } + /* don't call nfnl_shunlock, since it would reenter + * with further packet processing */ up(&nfnl_sem); } while(nfnl && nfnl->sk_receive_queue.qlen); } -- cgit v0.10.2 From 7af4cc3fa158ff1dda6e7451c7e6afa6b0bb85cb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:44:15 -0700 Subject: [NETFILTER]: Add "nfnetlink_queue" netfilter queue handler over nfnetlink - Add new nfnetlink_queue module - Add new ipt_NFQUEUE and ip6t_NFQUEUE modules to access queue numbers 1-65535 - Mark ip_queue and ip6_queue Kconfig options as OBSOLETE - Update feature-removal-schedule to remove ip[6]_queue in December Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 8b1430b..0665cb1 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -135,3 +135,15 @@ Why: With the 16-bit PCMCIA subsystem now behaving (almost) like a pcmciautils package available at http://kernel.org/pub/linux/utils/kernel/pcmcia/ Who: Dominik Brodowski + +--------------------------- + +What: ip_queue and ip6_queue (old ipv4-only and ipv6-only netfilter queue) +When: December 2005 +Why: This interface has been obsoleted by the new layer3-independent + "nfnetlink_queue". The Kernel interface is compatible, so the old + ip[6]tables "QUEUE" targets still work and will transparently handle + all packets into nfnetlink queue number 0. Userspace users will have + to link against API-compatible library on top of libnfnetlink_queue + instead of the current 'libipq'. +Who: Harald Welte diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h new file mode 100644 index 0000000..edb463a --- /dev/null +++ b/include/linux/netfilter/nfnetlink_queue.h @@ -0,0 +1,85 @@ +#ifndef _NFNETLINK_QUEUE_H +#define _NFNETLINK_QUEUE_H + +#include + +enum nfqnl_msg_types { + NFQNL_MSG_PACKET, /* packet from kernel to userspace */ + NFQNL_MSG_VERDICT, /* verdict from userspace to kernel */ + NFQNL_MSG_CONFIG, /* connect to a particular queue */ + + NFQNL_MSG_MAX +}; + +struct nfqnl_msg_packet_hdr { + u_int32_t packet_id; /* unique ID of packet in queue */ + u_int16_t hw_protocol; /* hw protocol (network order) */ + u_int8_t hook; /* netfilter hook */ +} __attribute__ ((packed)); + +struct nfqnl_msg_packet_hw { + u_int16_t hw_addrlen; + u_int16_t _pad; + u_int8_t hw_addr[8]; +} __attribute__ ((packed)); + +struct nfqnl_msg_packet_timestamp { + u_int64_t sec; + u_int64_t usec; +} __attribute__ ((packed)); + +enum nfqnl_attr_type { + NFQA_UNSPEC, + NFQA_PACKET_HDR, + NFQA_VERDICT_HDR, /* nfqnl_msg_verdict_hrd */ + NFQA_MARK, /* u_int32_t nfmark */ + NFQA_TIMESTAMP, /* nfqnl_msg_packet_timestamp */ + NFQA_IFINDEX_INDEV, /* u_int32_t ifindex */ + NFQA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ + NFQA_HWADDR, /* nfqnl_msg_packet_hw */ + NFQA_PAYLOAD, /* opaque data payload */ + + __NFQA_MAX +}; +#define NFQA_MAX (__NFQA_MAX - 1) + +struct nfqnl_msg_verdict_hdr { + u_int32_t verdict; + u_int32_t id; +} __attribute__ ((packed)); + + +enum nfqnl_msg_config_cmds { + NFQNL_CFG_CMD_NONE, + NFQNL_CFG_CMD_BIND, + NFQNL_CFG_CMD_UNBIND, + NFQNL_CFG_CMD_PF_BIND, + NFQNL_CFG_CMD_PF_UNBIND, +}; + +struct nfqnl_msg_config_cmd { + u_int8_t command; /* nfqnl_msg_config_cmds */ + u_int8_t _pad; + u_int16_t pf; /* AF_xxx for PF_[UN]BIND */ +} __attribute__ ((packed)); + +enum nfqnl_config_mode { + NFQNL_COPY_NONE, + NFQNL_COPY_META, + NFQNL_COPY_PACKET, +}; + +struct nfqnl_msg_config_params { + u_int32_t copy_range; + u_int8_t copy_mode; /* enum nfqnl_config_mode */ +} __attribute__ ((packed)); + + +enum nfqnl_attr_config { + NFQA_CFG_UNSPEC, + NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */ + NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */ + __NFQA_CFG_MAX +}; + +#endif /* _NFNETLINK_QUEUE_H */ diff --git a/include/linux/netfilter_ipv4/ipt_NFQUEUE.h b/include/linux/netfilter_ipv4/ipt_NFQUEUE.h new file mode 100644 index 0000000..b5b2943 --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_NFQUEUE.h @@ -0,0 +1,16 @@ +/* iptables module for using NFQUEUE mechanism + * + * (C) 2005 Harald Welte + * + * This software is distributed under GNU GPL v2, 1991 + * +*/ +#ifndef _IPT_NFQ_TARGET_H +#define _IPT_NFQ_TARGET_H + +/* target info */ +struct ipt_NFQ_info { + u_int16_t queuenum; +}; + +#endif /* _IPT_DSCP_TARGET_H */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index e47ba39..2fa26a4 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -110,11 +110,15 @@ config IP_NF_AMANDA To compile it as a module, choose M here. If unsure, say Y. config IP_NF_QUEUE - tristate "Userspace queueing via NETLINK" + tristate "IP Userspace queueing via NETLINK (OBSOLETE)" help Netfilter has the ability to queue packets to user space: the netlink device can be used to access them using this driver. + This option enables the old IPv4-only "ip_queue" implementation + which has been obsoleted by the new "nfnetlink_queue" code (see + CONFIG_NETFILTER_NETLINK_QUEUE). + To compile it as a module, choose M here. If unsure, say N. config IP_NF_IPTABLES diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index abf2a7d..c2ae663 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -91,3 +91,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c new file mode 100644 index 0000000..3cedc9b --- /dev/null +++ b/net/ipv4/netfilter/ipt_NFQUEUE.c @@ -0,0 +1,70 @@ +/* iptables module for using new netfilter netlink queue + * + * (C) 2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables NFQUEUE target"); +MODULE_LICENSE("GPL"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_NFQ_info *tinfo = targinfo; + + return NF_QUEUE_NR(tinfo->queuenum); +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) { + printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_NFQ_info))); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_NFQ_reg = { + .name = "NFQUEUE", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_NFQ_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_NFQ_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 77ec704..cd15519 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -10,13 +10,16 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)" # dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK #fi config IP6_NF_QUEUE - tristate "Userspace queueing via NETLINK" + tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" ---help--- This option adds a queue handler to the kernel for IPv6 - packets which lets us to receive the filtered packets - with QUEUE target using libiptc as we can do with - the IPv4 now. + packets which enables users to receive the filtered packets + with QUEUE target using libipq. + + THis option enables the old IPv6-only "ip6_queue" implementation + which has been obsoleted by the new "nfnetlink_queue" code (see + CONFIG_NETFILTER_NETLINK_QUEUE). (C) Fernando Anton 2001 IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 2e51714..847651d 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -24,3 +24,4 @@ obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ip6t_NFQUEUE.o diff --git a/net/ipv6/netfilter/ip6t_NFQUEUE.c b/net/ipv6/netfilter/ip6t_NFQUEUE.c new file mode 100644 index 0000000..c6e3730 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_NFQUEUE.c @@ -0,0 +1,70 @@ +/* ip6tables module for using new netfilter netlink queue + * + * (C) 2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("ip6tables NFQUEUE target"); +MODULE_LICENSE("GPL"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_NFQ_info *tinfo = targinfo; + + return NF_QUEUE_NR(tinfo->queuenum); +} + +static int +checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IP6T_ALIGN(sizeof(struct ipt_NFQ_info))) { + printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", + targinfosize, + IP6T_ALIGN(sizeof(struct ipt_NFQ_info))); + return 0; + } + + return 1; +} + +static struct ip6t_target ipt_NFQ_reg = { + .name = "NFQUEUE", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_target(&ipt_NFQ_reg); +} + +static void __exit fini(void) +{ + ip6t_unregister_target(&ipt_NFQ_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 3629d3d..f0eb23e 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -3,3 +3,11 @@ config NETFILTER_NETLINK help If this option is enabled, the kernel will include support for the new netfilter netlink interface. + +config NETFILTER_NETLINK_QUEUE + tristate "Netfilter NFQUEUE over NFNETLINK interface" + depends on NETFILTER_NETLINK + help + If this option isenabled, the kernel will include support + for queueing packets via NFNETLINK. + diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 02e67d3..14a0b18 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c new file mode 100644 index 0000000..2403261 --- /dev/null +++ b/net/netfilter/nfnetlink_queue.c @@ -0,0 +1,877 @@ +/* + * This is a module which is used for queueing packets and communicating with + * userspace via nfetlink. + * + * (C) 2005 by Harald Welte + * + * Based on the old ipv4-only ip_queue.c: + * (C) 2000-2002 James Morris + * (C) 2003-2005 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NFQNL_QMAX_DEFAULT 1024 + +#if 0 +#define QDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ + __FILE__, __LINE__, __FUNCTION__, \ + ## args) +#else +#define QDEBUG(x, ...) +#endif + +struct nfqnl_queue_entry { + struct list_head list; + struct nf_info *info; + struct sk_buff *skb; + unsigned int id; +}; + +struct nfqnl_instance { + struct hlist_node hlist; /* global list of queues */ + + int peer_pid; + unsigned int queue_maxlen; + unsigned int copy_range; + unsigned int queue_total; + unsigned int queue_dropped; + unsigned int queue_user_dropped; + + atomic_t id_sequence; /* 'sequence' of pkt ids */ + + u_int16_t queue_num; /* number of this queue */ + u_int8_t copy_mode; + + spinlock_t lock; + + struct list_head queue_list; /* packets in queue */ +}; + +typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long); + +static DEFINE_RWLOCK(instances_lock); + +u_int64_t htonll(u_int64_t in) +{ + u_int64_t out; + int i; + + for (i = 0; i < sizeof(u_int64_t); i++) + ((u_int8_t *)&out)[sizeof(u_int64_t)-1] = ((u_int8_t *)&in)[i]; + + return out; +} + +#define INSTANCE_BUCKETS 16 +static struct hlist_head instance_table[INSTANCE_BUCKETS]; + +static inline u_int8_t instance_hashfn(u_int16_t queue_num) +{ + return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; +} + +static struct nfqnl_instance * +__instance_lookup(u_int16_t queue_num) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct nfqnl_instance *inst; + + head = &instance_table[instance_hashfn(queue_num)]; + hlist_for_each_entry(inst, pos, head, hlist) { + if (inst->queue_num == queue_num) + return inst; + } + return NULL; +} + +static struct nfqnl_instance * +instance_lookup(u_int16_t queue_num) +{ + struct nfqnl_instance *inst; + + read_lock_bh(&instances_lock); + inst = __instance_lookup(queue_num); + read_unlock_bh(&instances_lock); + + return inst; +} + +static struct nfqnl_instance * +instance_create(u_int16_t queue_num, int pid) +{ + struct nfqnl_instance *inst; + + QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid); + + write_lock_bh(&instances_lock); + if (__instance_lookup(queue_num)) { + inst = NULL; + QDEBUG("aborting, instance already exists\n"); + goto out_unlock; + } + + inst = kmalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) + goto out_unlock; + + memset(inst, 0, sizeof(*inst)); + inst->queue_num = queue_num; + inst->peer_pid = pid; + inst->queue_maxlen = NFQNL_QMAX_DEFAULT; + inst->copy_range = 0xfffff; + inst->copy_mode = NFQNL_COPY_NONE; + atomic_set(&inst->id_sequence, 0); + inst->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&inst->queue_list); + + if (!try_module_get(THIS_MODULE)) + goto out_free; + + hlist_add_head(&inst->hlist, + &instance_table[instance_hashfn(queue_num)]); + + write_unlock_bh(&instances_lock); + + QDEBUG("successfully created new instance\n"); + + return inst; + +out_free: + kfree(inst); +out_unlock: + write_unlock_bh(&instances_lock); + return NULL; +} + +static void nfqnl_flush(struct nfqnl_instance *queue, int verdict); + +static void +_instance_destroy2(struct nfqnl_instance *inst, int lock) +{ + /* first pull it out of the global list */ + if (lock) + write_lock_bh(&instances_lock); + + QDEBUG("removing instance %p (queuenum=%u) from hash\n", + inst, inst->queue_num); + hlist_del(&inst->hlist); + + if (lock) + write_unlock_bh(&instances_lock); + + /* then flush all pending skbs from the queue */ + nfqnl_flush(inst, NF_DROP); + + /* and finally free the data structure */ + kfree(inst); + + module_put(THIS_MODULE); +} + +static inline void +__instance_destroy(struct nfqnl_instance *inst) +{ + _instance_destroy2(inst, 0); +} + +static inline void +instance_destroy(struct nfqnl_instance *inst) +{ + _instance_destroy2(inst, 1); +} + + + +static void +issue_verdict(struct nfqnl_queue_entry *entry, int verdict) +{ + QDEBUG("entering for entry %p, verdict %u\n", entry, verdict); + + /* TCP input path (and probably other bits) assume to be called + * from softirq context, not from syscall, like issue_verdict is + * called. TCP input path deadlocks with locks taken from timer + * softirq, e.g. We therefore emulate this by local_bh_disable() */ + + local_bh_disable(); + nf_reinject(entry->skb, entry->info, verdict); + local_bh_enable(); + + kfree(entry); +} + +static inline void +__enqueue_entry(struct nfqnl_instance *queue, + struct nfqnl_queue_entry *entry) +{ + list_add(&entry->list, &queue->queue_list); + queue->queue_total++; +} + +/* + * Find and return a queued entry matched by cmpfn, or return the last + * entry if cmpfn is NULL. + */ +static inline struct nfqnl_queue_entry * +__find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, + unsigned long data) +{ + struct list_head *p; + + list_for_each_prev(p, &queue->queue_list) { + struct nfqnl_queue_entry *entry = (struct nfqnl_queue_entry *)p; + + if (!cmpfn || cmpfn(entry, data)) + return entry; + } + return NULL; +} + +static inline void +__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry) +{ + list_del(&entry->list); + q->queue_total--; +} + +static inline struct nfqnl_queue_entry * +__find_dequeue_entry(struct nfqnl_instance *queue, + nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nfqnl_queue_entry *entry; + + entry = __find_entry(queue, cmpfn, data); + if (entry == NULL) + return NULL; + + __dequeue_entry(queue, entry); + return entry; +} + + +static inline void +__nfqnl_flush(struct nfqnl_instance *queue, int verdict) +{ + struct nfqnl_queue_entry *entry; + + while ((entry = __find_dequeue_entry(queue, NULL, 0))) + issue_verdict(entry, verdict); +} + +static inline int +__nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status = 0; + + switch (mode) { + case NFQNL_COPY_NONE: + case NFQNL_COPY_META: + queue->copy_mode = mode; + queue->copy_range = 0; + break; + + case NFQNL_COPY_PACKET: + queue->copy_mode = mode; + /* we're using struct nfattr which has 16bit nfa_len */ + if (range > 0xffff) + queue->copy_range = 0xffff; + else + queue->copy_range = range; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static struct nfqnl_queue_entry * +find_dequeue_entry(struct nfqnl_instance *queue, + nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nfqnl_queue_entry *entry; + + spin_lock_bh(&queue->lock); + entry = __find_dequeue_entry(queue, cmpfn, data); + spin_unlock_bh(&queue->lock); + + return entry; +} + +static void +nfqnl_flush(struct nfqnl_instance *queue, int verdict) +{ + spin_lock_bh(&queue->lock); + __nfqnl_flush(queue, verdict); + spin_unlock_bh(&queue->lock); +} + +static struct sk_buff * +nfqnl_build_packet_message(struct nfqnl_instance *queue, + struct nfqnl_queue_entry *entry, int *errp) +{ + unsigned char *old_tail; + size_t size; + size_t data_len = 0; + struct sk_buff *skb; + struct nfqnl_msg_packet_hdr pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int tmp_uint; + + QDEBUG("entered\n"); + + /* all macros expand to constant values at compile time */ + size = NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hdr)) + + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NLMSG_SPACE(sizeof(u_int32_t)) /* mark */ + + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw)) + + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp)); + + spin_lock_bh(&queue->lock); + + switch (queue->copy_mode) { + case NFQNL_COPY_META: + case NFQNL_COPY_NONE: + data_len = 0; + break; + + case NFQNL_COPY_PACKET: + if (queue->copy_range == 0 + || queue->copy_range > entry->skb->len) + data_len = entry->skb->len; + else + data_len = queue->copy_range; + + size += NLMSG_SPACE(data_len); + break; + + default: + *errp = -EINVAL; + spin_unlock_bh(&queue->lock); + return NULL; + } + + spin_unlock_bh(&queue->lock); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail= skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, + NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, + sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + nfmsg->nfgen_family = entry->info->pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(queue->queue_num); + + pmsg.packet_id = htonl(entry->id); + pmsg.hw_protocol = htons(entry->skb->protocol); + pmsg.hook = entry->info->hook; + + NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg); + + if (entry->info->indev) { + tmp_uint = htonl(entry->info->indev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); + } + + if (entry->info->outdev) { + tmp_uint = htonl(entry->info->outdev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); + } + + if (entry->skb->nfmark) { + tmp_uint = htonl(entry->skb->nfmark); + NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint); + } + + if (entry->info->indev && entry->skb->dev + && entry->skb->dev->hard_header_parse) { + struct nfqnl_msg_packet_hw phw; + + phw.hw_addrlen = + entry->skb->dev->hard_header_parse(entry->skb, + phw.hw_addr); + phw.hw_addrlen = htons(phw.hw_addrlen); + NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); + } + + if (entry->skb->stamp.tv_sec) { + struct nfqnl_msg_packet_timestamp ts; + + ts.sec = htonll(entry->skb->stamp.tv_sec); + ts.usec = htonll(entry->skb->stamp.tv_usec); + + NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); + } + + if (data_len) { + struct nfattr *nfa; + int size = NFA_LENGTH(data_len); + + if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) { + printk(KERN_WARNING "nf_queue: no tailroom!\n"); + goto nlmsg_failure; + } + + nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); + nfa->nfa_type = NFQA_PAYLOAD; + nfa->nfa_len = size; + + if (skb_copy_bits(entry->skb, 0, NFA_DATA(nfa), data_len)) + BUG(); + } + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: +nfattr_failure: + if (skb) + kfree_skb(skb); + *errp = -EINVAL; + if (net_ratelimit()) + printk(KERN_ERR "nf_queue: error creating packet message\n"); + return NULL; +} + +static int +nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data) +{ + int status = -EINVAL; + struct sk_buff *nskb; + struct nfqnl_instance *queue; + struct nfqnl_queue_entry *entry; + + QDEBUG("entered\n"); + + queue = instance_lookup(queuenum); + if (!queue) { + QDEBUG("no queue instance matching\n"); + return -EINVAL; + } + + if (queue->copy_mode == NFQNL_COPY_NONE) { + QDEBUG("mode COPY_NONE, aborting\n"); + return -EAGAIN; + } + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + if (net_ratelimit()) + printk(KERN_ERR + "nf_queue: OOM in nfqnl_enqueue_packet()\n"); + return -ENOMEM; + } + + entry->info = info; + entry->skb = skb; + entry->id = atomic_inc_return(&queue->id_sequence); + + nskb = nfqnl_build_packet_message(queue, entry, &status); + if (nskb == NULL) + goto err_out_free; + + spin_lock_bh(&queue->lock); + + if (!queue->peer_pid) + goto err_out_free_nskb; + + if (queue->queue_total >= queue->queue_maxlen) { + queue->queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk(KERN_WARNING "ip_queue: full at %d entries, " + "dropping packets(s). Dropped: %d\n", + queue->queue_total, queue->queue_dropped); + goto err_out_free_nskb; + } + + /* nfnetlink_unicast will either free the nskb or add it to a socket */ + status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue->queue_user_dropped++; + goto err_out_unlock; + } + + __enqueue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + spin_unlock_bh(&queue->lock); + +err_out_free: + kfree(entry); + return status; +} + +static int +nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e) +{ + int diff; + + diff = data_len - e->skb->len; + if (diff < 0) + skb_trim(e->skb, data_len); + else if (diff > 0) { + if (data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + struct sk_buff *newskb; + + newskb = skb_copy_expand(e->skb, + skb_headroom(e->skb), + diff, + GFP_ATOMIC); + if (newskb == NULL) { + printk(KERN_WARNING "ip_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + if (e->skb->sk) + skb_set_owner_w(newskb, e->skb->sk); + kfree_skb(e->skb); + e->skb = newskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(&e->skb, data_len)) + return -ENOMEM; + memcpy(e->skb->data, data, data_len); + + return 0; +} + +static inline int +id_cmp(struct nfqnl_queue_entry *e, unsigned long id) +{ + return (id == e->id); +} + +static int +nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status; + + spin_lock_bh(&queue->lock); + status = __nfqnl_set_mode(queue, mode, range); + spin_unlock_bh(&queue->lock); + + return status; +} + +static int +dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex) +{ + if (entry->info->indev) + if (entry->info->indev->ifindex == ifindex) + return 1; + + if (entry->info->outdev) + if (entry->info->outdev->ifindex == ifindex) + return 1; + + return 0; +} + +/* drop all packets with either indev or outdev == ifindex from all queue + * instances */ +static void +nfqnl_dev_drop(int ifindex) +{ + int i; + + QDEBUG("entering for ifindex %u\n", ifindex); + + /* this only looks like we have to hold the readlock for a way too long + * time, issue_verdict(), nf_reinject(), ... - but we always only + * issue NF_DROP, which is processed directly in nf_reinject() */ + read_lock_bh(&instances_lock); + + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp; + struct nfqnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry(inst, tmp, head, hlist) { + struct nfqnl_queue_entry *entry; + while ((entry = find_dequeue_entry(inst, dev_cmp, + ifindex)) != NULL) + issue_verdict(entry, NF_DROP); + } + } + + read_unlock_bh(&instances_lock); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static int +nfqnl_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + nfqnl_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_dev_notifier = { + .notifier_call = nfqnl_rcv_dev_event, +}; + +static int +nfqnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_NETFILTER && n->pid) { + int i; + + /* destroy all instances for this pid */ + write_lock_bh(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp, *t2; + struct nfqnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { + if (n->pid == inst->peer_pid) + __instance_destroy(inst); + } + } + write_unlock_bh(&instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_rtnl_notifier = { + .notifier_call = nfqnl_rcv_nl_event, +}; + +static int +nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + unsigned int verdict; + struct nfqnl_queue_entry *entry; + + queue = instance_lookup(queue_num); + if (!queue) + return -ENODEV; + + if (queue->peer_pid != NETLINK_CB(skb).pid) + return -EPERM; + + if (!nfqa[NFQA_VERDICT_HDR-1]) + return -EINVAL; + + vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]); + verdict = ntohl(vhdr->verdict); + + if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) + return -EINVAL; + + entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id)); + if (entry == NULL) + return -ENOENT; + + if (nfqa[NFQA_PAYLOAD-1]) { + if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]), + NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0) + verdict = NF_DROP; + } + + if (nfqa[NFQA_MARK-1]) + skb->nfmark = ntohl(*(u_int32_t *)NFA_DATA(nfqa[NFQA_MARK-1])); + + issue_verdict(entry, verdict); + return 0; +} + +static int +nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + return -ENOTSUPP; +} + +static int +nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + struct nfqnl_instance *queue; + + QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); + + queue = instance_lookup(queue_num); + if (nfqa[NFQA_CFG_CMD-1]) { + struct nfqnl_msg_config_cmd *cmd; + cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]); + QDEBUG("found CFG_CMD\n"); + + switch (cmd->command) { + case NFQNL_CFG_CMD_BIND: + if (queue) + return -EBUSY; + + queue = instance_create(queue_num, NETLINK_CB(skb).pid); + if (!queue) + return -EINVAL; + break; + case NFQNL_CFG_CMD_UNBIND: + if (!queue) + return -ENODEV; + + if (queue->peer_pid != NETLINK_CB(skb).pid) + return -EPERM; + + instance_destroy(queue); + break; + case NFQNL_CFG_CMD_PF_BIND: + QDEBUG("registering queue handler for pf=%u\n", + ntohs(cmd->pf)); + return nf_register_queue_handler(ntohs(cmd->pf), + nfqnl_enqueue_packet, + NULL); + + break; + case NFQNL_CFG_CMD_PF_UNBIND: + QDEBUG("unregistering queue handler for pf=%u\n", + ntohs(cmd->pf)); + /* This is a bug and a feature. We can unregister + * other handlers(!) */ + return nf_unregister_queue_handler(ntohs(cmd->pf)); + break; + default: + return -EINVAL; + } + } else { + if (!queue) { + QDEBUG("no config command, and no instance ENOENT\n"); + return -ENOENT; + } + + if (queue->peer_pid != NETLINK_CB(skb).pid) { + QDEBUG("no config command, and wrong pid\n"); + return -EPERM; + } + } + + if (nfqa[NFQA_CFG_PARAMS-1]) { + struct nfqnl_msg_config_params *params; + params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]); + + nfqnl_set_mode(queue, params->copy_mode, + ntohl(params->copy_range)); + } + + return 0; +} + +static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { + [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, + .cap_required = CAP_NET_ADMIN }, + [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, + .cap_required = CAP_NET_ADMIN }, + [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnetlink_subsystem nfqnl_subsys = { + .name = "nf_queue", + .subsys_id = NFNL_SUBSYS_QUEUE, + .cb_count = NFQNL_MSG_MAX, + .attr_count = NFQA_MAX, + .cb = nfqnl_cb, +}; + +static int +init_or_cleanup(int init) +{ + int status = -ENOMEM; + + if (!init) + goto cleanup; + + netlink_register_notifier(&nfqnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) { + printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + register_netdevice_notifier(&nfqnl_dev_notifier); + return status; + +cleanup: + nf_unregister_queue_handlers(nfqnl_enqueue_packet); + unregister_netdevice_notifier(&nfqnl_dev_notifier); + nfnetlink_subsys_unregister(&nfqnl_subsys); + +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("netfilter packet queue handler"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); + +module_init(init); +module_exit(fini); -- cgit v0.10.2 From 0bd1b59b15e4057101c89d4db15a3683c0d897f7 Mon Sep 17 00:00:00 2001 From: Andrew McDonald Date: Tue, 9 Aug 2005 19:44:42 -0700 Subject: [IPV6]: Check interface bindings on IPv6 raw socket reception Take account of whether a socket is bound to a particular device when selecting an IPv6 raw socket to receive a packet. Also perform this check when receiving IPv6 packets with router alert options. Signed-off-by: Andrew McDonald Signed-off-by: David S. Miller diff --git a/include/net/rawv6.h b/include/net/rawv6.h index 23fd9a6..887009a 100644 --- a/include/net/rawv6.h +++ b/include/net/rawv6.h @@ -10,7 +10,8 @@ extern rwlock_t raw_v6_lock; extern void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr); extern struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, - struct in6_addr *loc_addr, struct in6_addr *rmt_addr); + struct in6_addr *loc_addr, struct in6_addr *rmt_addr, + int dif); extern int rawv6_rcv(struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ff3ec98..ee9f1d3 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -551,7 +551,8 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info) read_lock(&raw_v6_lock); if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) { - while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr))) { + while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, + skb->dev->ifindex))) { rawv6_err(sk, skb, NULL, type, code, inner_offset, info); sk = sk_next(sk); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a7fcbcc..00f8514 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -277,7 +277,9 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel) read_lock(&ip6_ra_lock); for (ra = ip6_ra_chain; ra; ra = ra->next) { struct sock *sk = ra->sk; - if (sk && ra->sel == sel) { + if (sk && ra->sel == sel && + (!sk->sk_bound_dev_if || + sk->sk_bound_dev_if == skb->dev->ifindex)) { if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 1d4d75b..9db0de8 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -81,7 +81,8 @@ static void raw_v6_unhash(struct sock *sk) /* Grumble... icmp and ip_input want to get at this... */ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, - struct in6_addr *loc_addr, struct in6_addr *rmt_addr) + struct in6_addr *loc_addr, struct in6_addr *rmt_addr, + int dif) { struct hlist_node *node; int is_multicast = ipv6_addr_is_multicast(loc_addr); @@ -94,6 +95,9 @@ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, !ipv6_addr_equal(&np->daddr, rmt_addr)) continue; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + continue; + if (!ipv6_addr_any(&np->rcv_saddr)) { if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) goto found; @@ -160,7 +164,7 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (sk == NULL) goto out; - sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr); + sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, skb->dev->ifindex); while (sk) { if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) { @@ -170,7 +174,8 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (clone) rawv6_rcv(sk, clone); } - sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr); + sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, + skb->dev->ifindex); } out: read_unlock(&raw_v6_lock); -- cgit v0.10.2 From d13964f4490157b8a290903362bfbc54f750a6bc Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 19:45:02 -0700 Subject: [IPV4/6]: Check if packet was actually delivered to a raw socket to decide whether to send an ICMP unreachable Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/net/raw.h b/include/net/raw.h index 1c411c4..1c4bc3e 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -37,6 +37,6 @@ extern struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, unsigned long raddr, unsigned long laddr, int dif); -extern void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); +extern int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); #endif /* _RAW_H */ diff --git a/include/net/rawv6.h b/include/net/rawv6.h index 887009a..14476a7 100644 --- a/include/net/rawv6.h +++ b/include/net/rawv6.h @@ -7,7 +7,7 @@ extern struct hlist_head raw_v6_htable[RAWV6_HTABLE_SIZE]; extern rwlock_t raw_v6_lock; -extern void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr); +extern int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr); extern struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, struct in6_addr *loc_addr, struct in6_addr *rmt_addr, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d603247..81e1802 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) /* If there maybe a raw socket we must check - if not we * don't care less */ - if (raw_sk) - raw_v4_input(skb, skb->nh.iph, hash); + if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash)) + raw_sk = NULL; if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { int ret; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index d1835b1..e222c5c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) { struct sock *sk; struct hlist_head *head; + int delivered = 0; read_lock(&raw_v4_lock); head = &raw_v4_htable[hash]; @@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) skb->dev->ifindex); while (sk) { + delivered = 1; if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); @@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) } out: read_unlock(&raw_v4_lock); + return delivered; } void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index ab51c03..6e34804 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -166,8 +166,8 @@ resubmit: nexthdr = skb->nh.raw[nhoff]; raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); - if (raw_sk) - ipv6_raw_deliver(skb, nexthdr); + if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) + raw_sk = NULL; hash = nexthdr & (MAX_INET_PROTOS - 1); if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 9db0de8..a082646 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -141,11 +141,12 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) * * Caller owns SKB so we must make clones. */ -void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) +int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct in6_addr *saddr; struct in6_addr *daddr; struct sock *sk; + int delivered = 0; __u8 hash; saddr = &skb->nh.ipv6h->saddr; @@ -167,6 +168,7 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, skb->dev->ifindex); while (sk) { + delivered = 1; if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); @@ -179,6 +181,7 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) } out: read_unlock(&raw_v6_lock); + return delivered; } /* This cleans up af_inet6 a bit. -DaveM */ -- cgit v0.10.2 From e6848976b721eeb5551cd94673faafeef78d9f35 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:45:38 -0700 Subject: [NET]: Cleanup INET_REFCNT_DEBUG code Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_common.h b/include/net/inet_common.h index fbc1f4d..1fbd94d 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -29,7 +29,6 @@ extern unsigned int inet_poll(struct file * file, struct socket *sock, struct p extern int inet_listen(struct socket *sock, int backlog); extern void inet_sock_destruct(struct sock *sk); -extern atomic_t inet_sock_nr; extern int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 533fc07..c5a02dd 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -145,7 +145,6 @@ DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6); #define UDP6_INC_STATS(field) SNMP_INC_STATS(udp_stats_in6, field) #define UDP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_stats_in6, field) #define UDP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_stats_in6, field) -extern atomic_t inet6_sock_nr; int snmp6_register_dev(struct inet6_dev *idev); int snmp6_unregister_dev(struct inet6_dev *idev); diff --git a/include/net/sock.h b/include/net/sock.h index e9b1dba..11b8155 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -491,6 +491,9 @@ extern int sk_wait_data(struct sock *sk, long *timeo); struct request_sock_ops; +/* Here is the right place to enable sock refcounting debugging */ +#define SOCK_REFCNT_DEBUG + /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface * transport -> network interface is defined by struct inet_proto @@ -561,7 +564,9 @@ struct proto { char name[32]; struct list_head node; - +#ifdef SOCK_REFCNT_DEBUG + atomic_t socks; +#endif struct { int inuse; u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; @@ -571,6 +576,31 @@ struct proto { extern int proto_register(struct proto *prot, int alloc_slab); extern void proto_unregister(struct proto *prot); +#ifdef SOCK_REFCNT_DEBUG +static inline void sk_refcnt_debug_inc(struct sock *sk) +{ + atomic_inc(&sk->sk_prot->socks); +} + +static inline void sk_refcnt_debug_dec(struct sock *sk) +{ + atomic_dec(&sk->sk_prot->socks); + printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", + sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); +} + +static inline void sk_refcnt_debug_release(const struct sock *sk) +{ + if (atomic_read(&sk->sk_refcnt) != 1) + printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", + sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt)); +} +#else /* SOCK_REFCNT_DEBUG */ +#define sk_refcnt_debug_inc(sk) do { } while (0) +#define sk_refcnt_debug_dec(sk) do { } while (0) +#define sk_refcnt_debug_release(sk) do { } while (0) +#endif /* SOCK_REFCNT_DEBUG */ + /* Called with local bh disabled */ static __inline__ void sock_prot_inc_use(struct proto *prot) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 5010f0c..3198473 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -306,7 +306,7 @@ extern kmem_cache_t *tcp_timewait_cachep; static inline void tcp_tw_put(struct tcp_tw_bucket *tw) { if (atomic_dec_and_test(&tw->tw_refcnt)) { -#ifdef INET_REFCNT_DEBUG +#ifdef SOCK_REFCNT_DEBUG printk(KERN_DEBUG "tw_bucket %p released\n", tw); #endif kmem_cache_free(tcp_timewait_cachep, tw); diff --git a/net/core/sock.c b/net/core/sock.c index 51a5e7d..a1a23be 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1367,11 +1367,7 @@ void sk_common_release(struct sock *sk) xfrm_sk_free_policy(sk); -#ifdef INET_REFCNT_DEBUG - if (atomic_read(&sk->sk_refcnt) != 1) - printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n", - sk, atomic_read(&sk->sk_refcnt)); -#endif + sk_refcnt_debug_release(sk); sock_put(sk); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 163ae40..9e83d77 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -114,10 +114,6 @@ DEFINE_SNMP_STAT(struct linux_mib, net_statistics); -#ifdef INET_REFCNT_DEBUG -atomic_t inet_sock_nr; -#endif - extern void ip_mc_drop_socket(struct sock *sk); /* The inetsw table contains everything that inet_create needs to @@ -153,11 +149,7 @@ void inet_sock_destruct(struct sock *sk) if (inet->opt) kfree(inet->opt); dst_release(sk->sk_dst_cache); -#ifdef INET_REFCNT_DEBUG - atomic_dec(&inet_sock_nr); - printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", - sk, atomic_read(&inet_sock_nr)); -#endif + sk_refcnt_debug_dec(sk); } /* @@ -317,9 +309,7 @@ static int inet_create(struct socket *sock, int protocol) inet->mc_index = 0; inet->mc_list = NULL; -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif + sk_refcnt_debug_inc(sk); if (inet->num) { /* It assumes that any protocol which allows @@ -1205,7 +1195,3 @@ EXPORT_SYMBOL(inet_stream_ops); EXPORT_SYMBOL(inet_unregister_protosw); EXPORT_SYMBOL(net_statistics); EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); - -#ifdef INET_REFCNT_DEBUG -EXPORT_SYMBOL(inet_sock_nr); -#endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 42a2e2c..20159a3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1580,12 +1580,7 @@ void tcp_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); -#ifdef INET_REFCNT_DEBUG - if (atomic_read(&sk->sk_refcnt) != 1) { - printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", - sk, atomic_read(&sk->sk_refcnt)); - } -#endif + sk_refcnt_debug_release(sk); atomic_dec(&tcp_orphan_count); sock_put(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f42a284..f8e288c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -84,7 +84,7 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) tcp_bucket_destroy(tb); spin_unlock(&bhead->lock); -#ifdef INET_REFCNT_DEBUG +#ifdef SOCK_REFCNT_DEBUG if (atomic_read(&tw->tw_refcnt) != 1) { printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->tw_refcnt)); @@ -799,9 +799,21 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newsk->sk_err = 0; newsk->sk_priority = 0; atomic_set(&newsk->sk_refcnt, 2); -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif + + /* + * Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy), same rationale as the first comment in this + * function. + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. -acme + */ + sk_refcnt_debug_inc(newsk); + atomic_inc(&tcp_sockets_allocated); if (sock_flag(newsk, SOCK_KEEPOPEN)) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5740473..7df2ccb 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -86,26 +86,12 @@ extern void if6_proc_exit(void); int sysctl_ipv6_bindv6only; -#ifdef INET_REFCNT_DEBUG -atomic_t inet6_sock_nr; -EXPORT_SYMBOL(inet6_sock_nr); -#endif - /* The inetsw table contains everything that inet_create needs to * build a new socket. */ static struct list_head inetsw6[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw6_lock); -static void inet6_sock_destruct(struct sock *sk) -{ - inet_sock_destruct(sk); - -#ifdef INET_REFCNT_DEBUG - atomic_dec(&inet6_sock_nr); -#endif -} - static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); @@ -186,7 +172,7 @@ static int inet6_create(struct socket *sock, int protocol) inet->hdrincl = 1; } - sk->sk_destruct = inet6_sock_destruct; + sk->sk_destruct = inet_sock_destruct; sk->sk_family = PF_INET6; sk->sk_protocol = protocol; @@ -213,12 +199,17 @@ static int inet6_create(struct socket *sock, int protocol) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; + /* + * Increment only the relevant sk_prot->socks debug field, this changes + * the previous behaviour of incrementing both the equivalent to + * answer->prot->socks (inet6_sock_nr) and inet_sock_nr. + * + * This allows better debug granularity as we'll know exactly how many + * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6 + * transport protocol socks. -acme + */ + sk_refcnt_debug_inc(sk); - -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); - atomic_inc(&inet_sock_nr); -#endif if (inet->num) { /* It assumes that any protocol which allows * the user to assign a number at socket diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3bc144a..76fe239 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -163,6 +163,13 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, fl6_free_socklist(sk); ipv6_sock_mc_close(sk); + /* + * Sock is moving from IPv6 to IPv4 (sk_prot), so + * remove it from the refcnt debug socks count in the + * original family... + */ + sk_refcnt_debug_dec(sk); + if (sk->sk_protocol == IPPROTO_TCP) { struct tcp_sock *tp = tcp_sk(sk); @@ -192,9 +199,11 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, kfree_skb(pktopt); sk->sk_destruct = inet_sock_destruct; -#ifdef INET_REFCNT_DEBUG - atomic_dec(&inet6_sock_nr); -#endif + /* + * ... and add it to the refcnt debug socks count + * in the new family. -acme + */ + sk_refcnt_debug_inc(sk); module_put(THIS_MODULE); retv = 0; break; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ef29cfd..885e05b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1407,12 +1407,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = skb->nh.ipv6h->hop_limit; - /* Charge newly allocated IPv6 socket. Though it is mapped, - * it is IPv6 yet. + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks count + * here, tcp_create_openreq_child now does this for us, see the comment in + * that function for the gory details. -acme */ -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); -#endif /* It is tricky place. Until this moment IPv4 tcp worked with IPv6 af_tcp.af_specific. @@ -1467,10 +1466,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (newsk == NULL) goto out; - /* Charge newly allocated IPv6 socket */ -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); -#endif + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, tcp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ ip6_dst_store(newsk, dst, NULL); newsk->sk_route_caps = dst->dev->features & diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e9b2fd4..4a6421a 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -641,10 +641,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, else newinet->pmtudisc = IP_PMTUDISC_WANT; -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); - atomic_inc(&inet_sock_nr); -#endif + sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ce9245e..8d3f809 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -593,9 +593,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, newinet->mc_index = 0; newinet->mc_list = NULL; -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif + sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); -- cgit v0.10.2 From 614c6cb4f225a7da9f13e5dd0fac3b531078eb9f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:47:37 -0700 Subject: [SOCK]: Rename __tcp_v4_rehash to __sk_prot_rehash This operation was already generic and DCCP will use it. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/sock.h b/include/net/sock.h index 11b8155..f91ee82 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -612,6 +612,15 @@ static __inline__ void sock_prot_dec_use(struct proto *prot) prot->stats[smp_processor_id()].inuse--; } +/* With per-bucket locks this operation is not-atomic, so that + * this version is not worse. + */ +static inline void __sk_prot_rehash(struct sock *sk) +{ + sk->sk_prot->unhash(sk); + sk->sk_prot->hash(sk); +} + /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 67c6708..c7c99d3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1834,15 +1834,6 @@ do_time_wait: goto discard_it; } -/* With per-bucket locks this operation is not-atomic, so that - * this version is not worse. - */ -static void __tcp_v4_rehash(struct sock *sk) -{ - sk->sk_prot->unhash(sk); - sk->sk_prot->hash(sk); -} - static int tcp_v4_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); @@ -1889,7 +1880,7 @@ static int tcp_v4_reselect_saddr(struct sock *sk) * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ - __tcp_v4_rehash(sk); + __sk_prot_rehash(sk); return 0; } -- cgit v0.10.2 From 6cbb0df788b90777a7ed0f9d8261260353f48076 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:49:02 -0700 Subject: [SOCK]: Introduce sk_setup_caps From tcp_v4_setup_caps, that always is preceded by a call to __sk_dst_set, so coalesce this sequence into sk_setup_caps, removing one call to a TCP function in the IP layer. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/sock.h b/include/net/sock.h index f91ee82..69d869e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1025,6 +1025,16 @@ sk_dst_check(struct sock *sk, u32 cookie) return dst; } +static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) +{ + __sk_dst_set(sk, dst); + sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_TSO) { + if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) + sk->sk_route_caps &= ~NETIF_F_TSO; + } +} + static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb) { sk->sk_wmem_queued += skb->truesize; diff --git a/include/net/tcp.h b/include/net/tcp.h index 3198473..d95661a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1658,15 +1658,6 @@ static inline int tcp_paws_check(const struct tcp_options_received *rx_opt, int return 1; } -static inline void tcp_v4_setup_caps(struct sock *sk, struct dst_entry *dst) -{ - sk->sk_route_caps = dst->dev->features; - if (sk->sk_route_caps & NETIF_F_TSO) { - if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) - sk->sk_route_caps &= ~NETIF_F_TSO; - } -} - #define TCP_CHECK_TIMER(sk) do { } while (0) static inline int tcp_use_frto(const struct sock *sk) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c934f53..c72fc87 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -69,13 +69,10 @@ #include #include #include -#include -#include #include #include #include #include -#include #include #include #include @@ -84,6 +81,7 @@ #include #include #include +#include /* * Shall we try to damage output packets if routing dev changes? @@ -329,8 +327,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) if (ip_route_output_flow(&rt, &fl, sk, 0)) goto no_route; } - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); } skb->dst = dst_clone(&rt->u.dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7c99d3..4a5daec 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -837,8 +837,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto failure; /* OK, now commit destination to socket. */ - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->saddr, @@ -1553,8 +1552,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (!newsk) goto exit; - newsk->sk_dst_cache = dst; - tcp_v4_setup_caps(newsk, dst); + sk_setup_caps(newsk, dst); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); @@ -1855,8 +1853,7 @@ static int tcp_v4_reselect_saddr(struct sock *sk) if (err) return err; - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); new_saddr = rt->rt_src; @@ -1914,8 +1911,7 @@ int tcp_v4_rebuild_header(struct sock *sk) err = ip_route_output_flow(&rt, &fl, sk, 0); } if (!err) { - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); return 0; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f8e288c..7c46a55 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -711,6 +711,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, bh_lock_sock(newsk); rwlock_init(&newsk->sk_dst_lock); + newsk->sk_dst_cache = NULL; atomic_set(&newsk->sk_rmem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); atomic_set(&newsk->sk_wmem_alloc, 0); -- cgit v0.10.2 From 32519f11d38ea8f4f60896763bacec7db1760f9c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:50:02 -0700 Subject: [INET]: Introduce inet_sk_rebuild_header From tcp_v4_rebuild_header, that already was pretty generic, I only needed to use sk->sk_protocol instead of the hardcoded IPPROTO_TCP and establish the requirement that INET transport layer protocols that want to use this function map TCP_SYN_SENT to its equivalent state. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/ip.h b/include/linux/ip.h index 31e7ced..33e8a19 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -196,6 +196,8 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #endif #endif +extern int inet_sk_rebuild_header(struct sock *sk); + struct iphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 ihl:4, diff --git a/include/net/tcp.h b/include/net/tcp.h index d95661a..0c769ad 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -799,8 +799,6 @@ extern void tcp_parse_options(struct sk_buff *skb, * TCP v4 functions exported for the inet6 API */ -extern int tcp_v4_rebuild_header(struct sock *sk); - extern int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9e83d77..7137e64 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -951,6 +951,119 @@ void inet_unregister_protosw(struct inet_protosw *p) } } +/* + * Shall we try to damage output packets if routing dev changes? + */ + +int sysctl_ip_dynaddr; + +static int inet_sk_reselect_saddr(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + struct rtable *rt; + __u32 old_saddr = inet->saddr; + __u32 new_saddr; + __u32 daddr = inet->daddr; + + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; + + /* Query new route. */ + err = ip_route_connect(&rt, daddr, 0, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if, + sk->sk_protocol, + inet->sport, inet->dport, sk); + if (err) + return err; + + sk_setup_caps(sk, &rt->u.dst); + + new_saddr = rt->rt_src; + + if (new_saddr == old_saddr) + return 0; + + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "%s(): shifting inet->" + "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", + __FUNCTION__, + NIPQUAD(old_saddr), + NIPQUAD(new_saddr)); + } + + inet->saddr = inet->rcv_saddr = new_saddr; + + /* + * XXX The only one ugly spot where we need to + * XXX really change the sockets identity after + * XXX it has entered the hashes. -DaveM + * + * Besides that, it does not check for connection + * uniqueness. Wait for troubles. + */ + __sk_prot_rehash(sk); + return 0; +} + +int inet_sk_rebuild_header(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); + u32 daddr; + int err; + + /* Route is OK, nothing to do. */ + if (rt) + return 0; + + /* Reroute. */ + daddr = inet->daddr; + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; +{ + struct flowi fl = { + .oif = sk->sk_bound_dev_if, + .nl_u = { + .ip4_u = { + .daddr = daddr, + .saddr = inet->saddr, + .tos = RT_CONN_FLAGS(sk), + }, + }, + .proto = sk->sk_protocol, + .uli_u = { + .ports = { + .sport = inet->sport, + .dport = inet->dport, + }, + }, + }; + + err = ip_route_output_flow(&rt, &fl, sk, 0); +} + if (!err) + sk_setup_caps(sk, &rt->u.dst); + else { + /* Routing failed... */ + sk->sk_route_caps = 0; + /* + * Other protocols have to map its equivalent state to TCP_SYN_SENT. + * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme + */ + if (!sysctl_ip_dynaddr || + sk->sk_state != TCP_SYN_SENT || + (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || + (err = inet_sk_reselect_saddr(sk)) != 0) + sk->sk_err_soft = -err; + } + + return err; +} + +EXPORT_SYMBOL(inet_sk_rebuild_header); + #ifdef CONFIG_IP_MULTICAST static struct net_protocol igmp_protocol = { .handler = igmp_rcv, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c72fc87..dd568b0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,11 +83,6 @@ #include #include -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr; int sysctl_ip_default_ttl = IPDEFTTL; /* Generate a checksum for an outgoing IP datagram. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4a5daec..ae6fad9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1832,101 +1832,6 @@ do_time_wait: goto discard_it; } -static int tcp_v4_reselect_saddr(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - int err; - struct rtable *rt; - __u32 old_saddr = inet->saddr; - __u32 new_saddr; - __u32 daddr = inet->daddr; - - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; - - /* Query new route. */ - err = ip_route_connect(&rt, daddr, 0, - RT_CONN_FLAGS(sk), - sk->sk_bound_dev_if, - IPPROTO_TCP, - inet->sport, inet->dport, sk); - if (err) - return err; - - sk_setup_caps(sk, &rt->u.dst); - - new_saddr = rt->rt_src; - - if (new_saddr == old_saddr) - return 0; - - if (sysctl_ip_dynaddr > 1) { - printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->" - "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", - NIPQUAD(old_saddr), - NIPQUAD(new_saddr)); - } - - inet->saddr = new_saddr; - inet->rcv_saddr = new_saddr; - - /* XXX The only one ugly spot where we need to - * XXX really change the sockets identity after - * XXX it has entered the hashes. -DaveM - * - * Besides that, it does not check for connection - * uniqueness. Wait for troubles. - */ - __sk_prot_rehash(sk); - return 0; -} - -int tcp_v4_rebuild_header(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); - u32 daddr; - int err; - - /* Route is OK, nothing to do. */ - if (rt) - return 0; - - /* Reroute. */ - daddr = inet->daddr; - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; - - { - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .nl_u = { .ip4_u = - { .daddr = daddr, - .saddr = inet->saddr, - .tos = RT_CONN_FLAGS(sk) } }, - .proto = IPPROTO_TCP, - .uli_u = { .ports = - { .sport = inet->sport, - .dport = inet->dport } } }; - - err = ip_route_output_flow(&rt, &fl, sk, 0); - } - if (!err) { - sk_setup_caps(sk, &rt->u.dst); - return 0; - } - - /* Routing failed... */ - sk->sk_route_caps = 0; - - if (!sysctl_ip_dynaddr || - sk->sk_state != TCP_SYN_SENT || - (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || - (err = tcp_v4_reselect_saddr(sk)) != 0) - sk->sk_err_soft = -err; - - return err; -} - static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) { struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; @@ -1998,7 +1903,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) struct tcp_func ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, - .rebuild_header = tcp_v4_rebuild_header, + .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .remember_stamp = tcp_v4_remember_stamp, @@ -2630,7 +2535,6 @@ EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); EXPORT_SYMBOL(tcp_v4_connect); EXPORT_SYMBOL(tcp_v4_do_rcv); -EXPORT_SYMBOL(tcp_v4_rebuild_header); EXPORT_SYMBOL(tcp_v4_remember_stamp); EXPORT_SYMBOL(tcp_v4_send_check); EXPORT_SYMBOL(tcp_v4_syn_recv_sock); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 885e05b..4e32a849 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1983,7 +1983,7 @@ static struct tcp_func ipv6_specific = { static struct tcp_func ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, - .rebuild_header = tcp_v4_rebuild_header, + .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .remember_stamp = tcp_v4_remember_stamp, -- cgit v0.10.2 From 838ab6364956d9bdcefe84712de1621cf20a40b3 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:50:45 -0700 Subject: [NETFILTER]: Add refcounting and /proc/net/netfilter interface to nfnetlink_queue Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h index edb463a..e142b0f 100644 --- a/include/linux/netfilter/nfnetlink_queue.h +++ b/include/linux/netfilter/nfnetlink_queue.h @@ -81,5 +81,6 @@ enum nfqnl_attr_config { NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */ __NFQA_CFG_MAX }; +#define NFQA_CFG_MAX (__NFQA_CFG_MAX-1) #endif /* _NFNETLINK_QUEUE_H */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 2403261..eab309e 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,7 @@ struct nfqnl_queue_entry { struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ + atomic_t use; int peer_pid; unsigned int queue_maxlen; @@ -105,17 +107,28 @@ __instance_lookup(u_int16_t queue_num) } static struct nfqnl_instance * -instance_lookup(u_int16_t queue_num) +instance_lookup_get(u_int16_t queue_num) { struct nfqnl_instance *inst; read_lock_bh(&instances_lock); inst = __instance_lookup(queue_num); + if (inst) + atomic_inc(&inst->use); read_unlock_bh(&instances_lock); return inst; } +static void +instance_put(struct nfqnl_instance *inst) +{ + if (inst && atomic_dec_and_test(&inst->use)) { + QDEBUG("kfree(inst=%p)\n", inst); + kfree(inst); + } +} + static struct nfqnl_instance * instance_create(u_int16_t queue_num, int pid) { @@ -141,6 +154,8 @@ instance_create(u_int16_t queue_num, int pid) inst->copy_range = 0xfffff; inst->copy_mode = NFQNL_COPY_NONE; atomic_set(&inst->id_sequence, 0); + /* needs to be two, since we _put() after creation */ + atomic_set(&inst->use, 2); inst->lock = SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&inst->queue_list); @@ -182,8 +197,8 @@ _instance_destroy2(struct nfqnl_instance *inst, int lock) /* then flush all pending skbs from the queue */ nfqnl_flush(inst, NF_DROP); - /* and finally free the data structure */ - kfree(inst); + /* and finally put the refcount */ + instance_put(inst); module_put(THIS_MODULE); } @@ -471,7 +486,7 @@ nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, QDEBUG("entered\n"); - queue = instance_lookup(queuenum); + queue = instance_lookup_get(queuenum); if (!queue) { QDEBUG("no queue instance matching\n"); return -EINVAL; @@ -479,7 +494,8 @@ nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, if (queue->copy_mode == NFQNL_COPY_NONE) { QDEBUG("mode COPY_NONE, aborting\n"); - return -EAGAIN; + status = -EAGAIN; + goto err_out_put; } entry = kmalloc(sizeof(*entry), GFP_ATOMIC); @@ -487,7 +503,8 @@ nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, if (net_ratelimit()) printk(KERN_ERR "nf_queue: OOM in nfqnl_enqueue_packet()\n"); - return -ENOMEM; + status = -ENOMEM; + goto err_out_put; } entry->info = info; @@ -523,6 +540,7 @@ nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, __enqueue_entry(queue, entry); spin_unlock_bh(&queue->lock); + instance_put(queue); return status; err_out_free_nskb: @@ -533,6 +551,8 @@ err_out_unlock: err_out_free: kfree(entry); +err_out_put: + instance_put(queue); return status; } @@ -685,6 +705,12 @@ static struct notifier_block nfqnl_rtnl_notifier = { .notifier_call = nfqnl_rcv_nl_event, }; +static const int nfqa_verdict_min[NFQA_MAX] = { + [NFQA_VERDICT_HDR-1] = sizeof(struct nfqnl_msg_verdict_hdr), + [NFQA_MARK-1] = sizeof(u_int32_t), + [NFQA_PAYLOAD-1] = 0, +}; + static int nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) @@ -696,26 +722,40 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, struct nfqnl_instance *queue; unsigned int verdict; struct nfqnl_queue_entry *entry; + int err; - queue = instance_lookup(queue_num); + if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) { + QDEBUG("bad attribute size\n"); + return -EINVAL; + } + + queue = instance_lookup_get(queue_num); if (!queue) return -ENODEV; - if (queue->peer_pid != NETLINK_CB(skb).pid) - return -EPERM; + if (queue->peer_pid != NETLINK_CB(skb).pid) { + err = -EPERM; + goto err_out_put; + } - if (!nfqa[NFQA_VERDICT_HDR-1]) - return -EINVAL; + if (!nfqa[NFQA_VERDICT_HDR-1]) { + err = -EINVAL; + goto err_out_put; + } vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]); verdict = ntohl(vhdr->verdict); - if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) - return -EINVAL; + if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) { + err = -EINVAL; + goto err_out_put; + } entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id)); - if (entry == NULL) - return -ENOENT; + if (entry == NULL) { + err = -ENOENT; + goto err_out_put; + } if (nfqa[NFQA_PAYLOAD-1]) { if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]), @@ -727,7 +767,12 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, skb->nfmark = ntohl(*(u_int32_t *)NFA_DATA(nfqa[NFQA_MARK-1])); issue_verdict(entry, verdict); + instance_put(queue); return 0; + +err_out_put: + instance_put(queue); + return err; } static int @@ -737,6 +782,11 @@ nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, return -ENOTSUPP; } +static const int nfqa_cfg_min[NFQA_CFG_MAX] = { + [NFQA_CFG_CMD-1] = sizeof(struct nfqnl_msg_config_cmd), + [NFQA_CFG_PARAMS-1] = sizeof(struct nfqnl_msg_config_params), +}; + static int nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) @@ -744,10 +794,16 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_instance *queue; + int ret = 0; QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); - queue = instance_lookup(queue_num); + if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) { + QDEBUG("bad attribute size\n"); + return -EINVAL; + } + + queue = instance_lookup_get(queue_num); if (nfqa[NFQA_CFG_CMD-1]) { struct nfqnl_msg_config_cmd *cmd; cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]); @@ -766,17 +822,19 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, if (!queue) return -ENODEV; - if (queue->peer_pid != NETLINK_CB(skb).pid) - return -EPERM; + if (queue->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto out_put; + } instance_destroy(queue); break; case NFQNL_CFG_CMD_PF_BIND: QDEBUG("registering queue handler for pf=%u\n", ntohs(cmd->pf)); - return nf_register_queue_handler(ntohs(cmd->pf), - nfqnl_enqueue_packet, - NULL); + ret = nf_register_queue_handler(ntohs(cmd->pf), + nfqnl_enqueue_packet, + NULL); break; case NFQNL_CFG_CMD_PF_UNBIND: @@ -784,20 +842,23 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ntohs(cmd->pf)); /* This is a bug and a feature. We can unregister * other handlers(!) */ - return nf_unregister_queue_handler(ntohs(cmd->pf)); + ret = nf_unregister_queue_handler(ntohs(cmd->pf)); break; default: - return -EINVAL; + ret = -EINVAL; + break; } } else { if (!queue) { QDEBUG("no config command, and no instance ENOENT\n"); - return -ENOENT; + ret = -ENOENT; + goto out_put; } if (queue->peer_pid != NETLINK_CB(skb).pid) { QDEBUG("no config command, and wrong pid\n"); - return -EPERM; + ret = -EPERM; + goto out_put; } } @@ -809,7 +870,9 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, ntohl(params->copy_range)); } - return 0; +out_put: + instance_put(queue); + return ret; } static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { @@ -829,14 +892,132 @@ static struct nfnetlink_subsystem nfqnl_subsys = { .cb = nfqnl_cb, }; +#ifdef CONFIG_PROC_FS +struct iter_state { + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + + if (!st) + return NULL; + + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&instance_table[st->bucket])) + return instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + struct iter_state *st = seq->private; + + h = h->next; + while (!h) { + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + h = instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&instances_lock); + return get_idx(seq, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfqnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", + inst->queue_num, + inst->peer_pid, inst->queue_total, + inst->copy_mode, inst->copy_range, + inst->queue_dropped, inst->queue_user_dropped, + atomic_read(&inst->id_sequence), + atomic_read(&inst->use)); +} + +static struct seq_operations nfqnl_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqnl_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct iter_state *is; + int ret; + + is = kmalloc(sizeof(*is), GFP_KERNEL); + if (!is) + return -ENOMEM; + memset(is, 0, sizeof(*is)); + ret = seq_open(file, &nfqnl_seq_ops); + if (ret < 0) + goto out_free; + seq = file->private_data; + seq->private = is; + return ret; +out_free: + kfree(is); + return ret; +} + +static struct file_operations nfqnl_file_ops = { + .owner = THIS_MODULE, + .open = nfqnl_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ + static int init_or_cleanup(int init) { - int status = -ENOMEM; + int i, status = -ENOMEM; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_nfqueue; +#endif if (!init) goto cleanup; + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&instance_table[i]); + netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { @@ -844,14 +1025,25 @@ init_or_cleanup(int init) goto cleanup_netlink_notifier; } +#ifdef CONFIG_PROC_FS + proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440, + proc_net_netfilter); + if (!proc_nfqueue) + goto cleanup_subsys; + proc_nfqueue->proc_fops = &nfqnl_file_ops; +#endif + register_netdevice_notifier(&nfqnl_dev_notifier); + return status; cleanup: nf_unregister_queue_handlers(nfqnl_enqueue_packet); unregister_netdevice_notifier(&nfqnl_dev_notifier); +#ifdef CONFIG_PROC_FS +cleanup_subsys: +#endif nfnetlink_subsys_unregister(&nfqnl_subsys); - cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); return status; -- cgit v0.10.2 From 608c8e4f7b6e61cc783283e9dff8a465a5ad59bb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:58:27 -0700 Subject: [NETFILTER]: Extend netfilter logging API This patch is in preparation to nfnetlink_log: - loggers now have to register struct nf_logger instead of nf_logfn - nf_log_unregister() replaced by nf_log_unregister_pf() and nf_log_unregister_logger() - add comment to ip[6]t_LOG.h to assure nobody redefines flags - add /proc/net/netfilter/nf_log to tell user which logger is currently registered for which address family - if user has configured logging, but no logging backend (logger) is available, always spit a message to syslog, not just the first time. - split ip[6]t_LOG.c into two parts: Backend: Always try to register as logger for the respective address family Frontend: Always log via nf_log_packet() API - modify all users of nf_log_packet() to accomodate additional argument Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 711e05f..815583a 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -114,15 +114,51 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg); extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; -typedef void nf_logfn(unsigned int hooknum, +/* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will + * disappear once iptables is replaced with pkttables. Please DO NOT use them + * for any new code! */ +#define NF_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ +#define NF_LOG_TCPOPT 0x02 /* Log TCP options */ +#define NF_LOG_IPOPT 0x04 /* Log IP options */ +#define NF_LOG_UID 0x08 /* Log UID owning local socket */ +#define NF_LOG_MASK 0x0f + +#define NF_LOG_TYPE_LOG 0x01 +#define NF_LOG_TYPE_ULOG 0x02 + +struct nf_loginfo { + u_int8_t type; + union { + struct { + u_int32_t copy_len; + u_int16_t group; + u_int16_t qthreshold; + } ulog; + struct { + u_int8_t level; + u_int8_t logflags; + } log; + } u; +}; + +typedef void nf_logfn(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const struct nf_loginfo *li, const char *prefix); +struct nf_logger { + struct module *me; + nf_logfn *logfn; + char *name; +}; + /* Function to register/unregister log function. */ -int nf_log_register(int pf, nf_logfn *logfn); -void nf_log_unregister(int pf, nf_logfn *logfn); +int nf_log_register(int pf, struct nf_logger *logger); +void nf_log_unregister_pf(int pf); +void nf_log_unregister_logger(struct nf_logger *logger); /* Calls the registered backend logging function */ void nf_log_packet(int pf, @@ -130,6 +166,7 @@ void nf_log_packet(int pf, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + struct nf_loginfo *li, const char *fmt, ...); /* Activate hook; either okfn or kfree_skb called, unless a hook @@ -221,6 +258,11 @@ struct nf_queue_rerouter { extern int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer); extern int nf_unregister_queue_rerouter(int pf); +#ifdef CONFIG_PROC_FS +#include +extern struct proc_dir_entry *proc_net_netfilter; +#endif + #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} diff --git a/include/linux/netfilter_ipv4/ipt_LOG.h b/include/linux/netfilter_ipv4/ipt_LOG.h index d25f782..22d1617 100644 --- a/include/linux/netfilter_ipv4/ipt_LOG.h +++ b/include/linux/netfilter_ipv4/ipt_LOG.h @@ -1,6 +1,7 @@ #ifndef _IPT_LOG_H #define _IPT_LOG_H +/* make sure not to change this without changing netfilter.h:NF_LOG_* (!) */ #define IPT_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ #define IPT_LOG_TCPOPT 0x02 /* Log TCP options */ #define IPT_LOG_IPOPT 0x04 /* Log IP options */ diff --git a/include/linux/netfilter_ipv6/ip6t_LOG.h b/include/linux/netfilter_ipv6/ip6t_LOG.h index 42996a4..9008ff5 100644 --- a/include/linux/netfilter_ipv6/ip6t_LOG.h +++ b/include/linux/netfilter_ipv6/ip6t_LOG.h @@ -1,6 +1,7 @@ #ifndef _IP6T_LOG_H #define _IP6T_LOG_H +/* make sure not to change this without changing netfilter.h:NF_LOG_* (!) */ #define IP6T_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ #define IP6T_LOG_TCPOPT 0x02 /* Log TCP options */ #define IP6T_LOG_IPOPT 0x04 /* Log IP options */ diff --git a/net/core/netfilter.c b/net/core/netfilter.c index 3e38084..98cc61e 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -22,6 +22,7 @@ #include #include #include +#include #include /* In this code, we can be waiting indefinitely for userspace to @@ -535,11 +536,10 @@ EXPORT_SYMBOL(skb_make_writable); #define NF_LOG_PREFIXLEN 128 -static nf_logfn *nf_logging[NPROTO]; /* = NULL */ -static int reported = 0; +static struct nf_logger *nf_logging[NPROTO]; /* = NULL */ static DEFINE_SPINLOCK(nf_log_lock); -int nf_log_register(int pf, nf_logfn *logfn) +int nf_log_register(int pf, struct nf_logger *logger) { int ret = -EBUSY; @@ -547,54 +547,134 @@ int nf_log_register(int pf, nf_logfn *logfn) * substituting pointer. */ spin_lock(&nf_log_lock); if (!nf_logging[pf]) { - rcu_assign_pointer(nf_logging[pf], logfn); + rcu_assign_pointer(nf_logging[pf], logger); ret = 0; } spin_unlock(&nf_log_lock); return ret; } -void nf_log_unregister(int pf, nf_logfn *logfn) +void nf_log_unregister_pf(int pf) { spin_lock(&nf_log_lock); - if (nf_logging[pf] == logfn) - nf_logging[pf] = NULL; + nf_logging[pf] = NULL; spin_unlock(&nf_log_lock); /* Give time to concurrent readers. */ synchronize_net(); -} +} + +void nf_log_unregister_logger(struct nf_logger *logger) +{ + int i; + + spin_lock(&nf_log_lock); + for (i = 0; i < NPROTO; i++) { + if (nf_logging[i] == logger) + nf_logging[i] = NULL; + } + spin_unlock(&nf_log_lock); + + synchronize_net(); +} void nf_log_packet(int pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + struct nf_loginfo *loginfo, const char *fmt, ...) { va_list args; char prefix[NF_LOG_PREFIXLEN]; - nf_logfn *logfn; + struct nf_logger *logger; rcu_read_lock(); - logfn = rcu_dereference(nf_logging[pf]); - if (logfn) { + logger = rcu_dereference(nf_logging[pf]); + if (logger) { va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); va_end(args); /* We must read logging before nf_logfn[pf] */ - logfn(hooknum, skb, in, out, prefix); - } else if (!reported) { - printk(KERN_WARNING "nf_log_packet: can\'t log yet, " - "no backend logging module loaded in!\n"); - reported++; + logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); + } else if (net_ratelimit()) { + printk(KERN_WARNING "nf_log_packet: can\'t log since " + "no backend logging module loaded in! Please either " + "load one, or disable logging explicitly\n"); } rcu_read_unlock(); } EXPORT_SYMBOL(nf_log_register); -EXPORT_SYMBOL(nf_log_unregister); +EXPORT_SYMBOL(nf_log_unregister_pf); +EXPORT_SYMBOL(nf_log_unregister_logger); EXPORT_SYMBOL(nf_log_packet); +#ifdef CONFIG_PROC_FS +struct proc_dir_entry *proc_net_netfilter; +EXPORT_SYMBOL(proc_net_netfilter); + +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + rcu_read_lock(); + + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void seq_stop(struct seq_file *s, void *v) +{ + rcu_read_unlock(); +} + +static int seq_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + const struct nf_logger *logger; + + logger = rcu_dereference(nf_logging[*pos]); + + if (!logger) + return seq_printf(s, "%2lld NONE\n", *pos); + + return seq_printf(s, "%2lld %s\n", *pos, logger->name); +} + +static struct seq_operations nflog_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nflog_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nflog_seq_ops); +} + +static struct file_operations nflog_file_ops = { + .owner = THIS_MODULE, + .open = nflog_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* PROC_FS */ + + /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence manufactured ICMP or RST packets will not be associated with it. */ @@ -613,6 +693,9 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) void __init netfilter_init(void) { int i, h; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; +#endif queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter), GFP_KERNEL); @@ -624,6 +707,16 @@ void __init netfilter_init(void) for (h = 0; h < NF_MAX_HOOKS; h++) INIT_LIST_HEAD(&nf_hooks[i][h]); } + +#ifdef CONFIG_PROC_FS + proc_net_netfilter = proc_mkdir("netfilter", proc_net); + if (!proc_net_netfilter) + panic("cannot create netfilter proc entry"); + pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter); + if (!pde) + panic("cannot create /proc/net/netfilter/nf_log"); + pde->proc_fops = &nflog_file_ops; +#endif } EXPORT_SYMBOL(ip_ct_attach); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 3f90cb9..838d1d6 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -217,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); if (icmph == NULL) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: short packet "); return -NF_ACCEPT; } @@ -231,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, if (!(u16)csum_fold(skb->csum)) break; if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad HW ICMP checksum "); return -NF_ACCEPT; case CHECKSUM_NONE: if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad ICMP checksum "); return -NF_ACCEPT; } @@ -254,7 +254,7 @@ checksum_skipped: */ if (icmph->type > NR_ICMP_TYPES) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: invalid ICMP type "); return -NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index c2bce22..f23ef1f 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -716,7 +716,7 @@ static int tcp_in_window(struct ip_ct_tcp *state, res = 1; } else { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? after(end, sender->td_end - receiver->td_maxwin - 1) ? @@ -815,7 +815,7 @@ static int tcp_error(struct sk_buff *skb, sizeof(_tcph), &_tcph); if (th == NULL) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: short packet "); return -NF_ACCEPT; } @@ -823,7 +823,7 @@ static int tcp_error(struct sk_buff *skb, /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -840,7 +840,7 @@ static int tcp_error(struct sk_buff *skb, skb->ip_summed == CHECKSUM_HW ? skb->csum : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; } @@ -849,7 +849,7 @@ static int tcp_error(struct sk_buff *skb, tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); if (!tcp_valid_flags[tcpflags]) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; } @@ -897,8 +897,9 @@ static int tcp_packet(struct ip_conntrack *conntrack, */ write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, - "ip_ct_tcp: killing out of sync session "); + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + NULL, "ip_ct_tcp: " + "killing out of sync session "); if (del_timer(&conntrack->timeout)) conntrack->timeout.function((unsigned long) conntrack); @@ -912,7 +913,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid packet ignored "); return NF_ACCEPT; case TCP_CONNTRACK_MAX: @@ -922,7 +923,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, old_state); write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_SYN_SENT: @@ -943,7 +944,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, - "ip_ct_tcp: invalid SYN"); + NULL, "ip_ct_tcp: invalid SYN"); return -NF_ACCEPT; } case TCP_CONNTRACK_CLOSE: diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 1413016..f2dcac7 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -98,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); if (hdr == NULL) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: short packet "); return -NF_ACCEPT; } @@ -106,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -126,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, skb->ip_summed == CHECKSUM_HW ? skb->csum : skb_checksum(skb, iph->ihl*4, udplen, 0))) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: bad UDP checksum "); return -NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index ef08733..92ed050 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -27,10 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("iptables syslog logging module"); -static unsigned int nflog = 1; -module_param(nflog, int, 0400); -MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); - #if 0 #define DEBUGP printk #else @@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); static DEFINE_SPINLOCK(log_lock); /* One level of recursion won't kill us */ -static void dump_packet(const struct ipt_log_info *info, +static void dump_packet(const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { struct iphdr _iph, *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { @@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info, if (ntohs(ih->frag_off) & IP_OFFSET) printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - if ((info->logflags & IPT_LOG_IPOPT) + if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; unsigned int i, optsize; @@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info, printk("SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (info->logflags & IPT_LOG_TCPSEQ) + if (logflags & IPT_LOG_TCPSEQ) printk("SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ @@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info, /* Max length: 11 "URGP=65535 " */ printk("URGP=%u ", ntohs(th->urg_ptr)); - if ((info->logflags & IPT_LOG_TCPOPT) + if ((logflags & IPT_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; unsigned char *op; @@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info, } /* Max length: 15 "UID=4294967295 " */ - if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { + if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) printk("UID=%u ", skb->sk->sk_socket->file->f_uid); @@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info, /* maxlen = 230+ 91 + 230 + 252 = 803 */ } +struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 0, + .logflags = NF_LOG_MASK, + }, + }, +}; + static void -ipt_log_packet(unsigned int hooknum, +ipt_log_packet(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const struct ipt_log_info *loginfo, - const char *level_string, + const struct nf_loginfo *loginfo, const char *prefix) { + if (!loginfo) + loginfo = &default_loginfo; + spin_lock_bh(&log_lock); - printk(level_string); - printk("%sIN=%s OUT=%s ", - prefix == NULL ? loginfo->prefix : prefix, + printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); #ifdef CONFIG_BRIDGE_NETFILTER @@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb, void *userinfo) { const struct ipt_log_info *loginfo = targinfo; - char level_string[4] = "< >"; + struct nf_loginfo li; - level_string[1] = '0' + (loginfo->level % 8); - ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; - return IPT_CONTINUE; -} + nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix); -static void -ipt_logfn(unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const char *prefix) -{ - struct ipt_log_info loginfo = { - .level = 0, - .logflags = IPT_LOG_MASK, - .prefix = "" - }; - - ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); + return IPT_CONTINUE; } static int ipt_log_checkentry(const char *tablename, @@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = { .me = THIS_MODULE, }; +static struct nf_logger ipt_log_logger ={ + .name = "ipt_LOG", + .logfn = &ipt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { if (ipt_register_target(&ipt_log_reg)) return -EINVAL; - if (nflog) - nf_log_register(PF_INET, &ipt_logfn); + if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { + printk(KERN_WARNING "ipt_LOG: not logging via system console " + "since somebody else already registered for PF_INET\n"); + /* we cannot make module load fail here, since otherwise + * iptables userspace would abort */ + } return 0; } static void __exit fini(void) { - if (nflog) - nf_log_unregister(PF_INET, &ipt_logfn); + nf_log_unregister_logger(&ipt_log_logger); ipt_unregister_target(&ipt_log_reg); } diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 4ea8371..b86f06e 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -304,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb, return IPT_CONTINUE; } -static void ipt_logfn(unsigned int hooknum, +static void ipt_logfn(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const struct nf_loginfo *li, const char *prefix) { - struct ipt_ulog_info loginfo = { - .nl_group = ULOG_DEFAULT_NLGROUP, - .copy_range = 0, - .qthreshold = ULOG_DEFAULT_QTHRESHOLD, - .prefix = "" - }; + struct ipt_ulog_info loginfo; + + if (!li || li->type != NF_LOG_TYPE_ULOG) { + loginfo.nl_group = ULOG_DEFAULT_NLGROUP; + loginfo.copy_range = 0; + loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; + loginfo.prefix[0] = '\0'; + } else { + loginfo.nl_group = li->u.ulog.group; + loginfo.copy_range = li->u.ulog.copy_len; + loginfo.qthreshold = li->u.ulog.qthreshold; + strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); + } ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); } @@ -355,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = { .me = THIS_MODULE, }; +static struct nf_logger ipt_ulog_logger = { + .name = "ipt_ULOG", + .logfn = &ipt_logfn, + .me = THIS_MODULE, +}; + static int __init init(void) { int i; @@ -382,7 +397,7 @@ static int __init init(void) return -EINVAL; } if (nflog) - nf_log_register(PF_INET, &ipt_logfn); + nf_log_register(PF_INET, &ipt_ulog_logger); return 0; } @@ -395,7 +410,7 @@ static void __exit fini(void) DEBUGP("ipt_ULOG: cleanup_module\n"); if (nflog) - nf_log_unregister(PF_INET, &ipt_logfn); + nf_log_unregister_logger(&ipt_ulog_logger); ipt_unregister_target(&ipt_ulog_reg); sock_release(nflognl->sk_socket); diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index a692e26..0cd1d1b 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -26,10 +26,6 @@ MODULE_AUTHOR("Jan Rekorajski "); MODULE_DESCRIPTION("IP6 tables LOG target module"); MODULE_LICENSE("GPL"); -static unsigned int nflog = 1; -module_param(nflog, int, 0400); -MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); - struct in_device; #include #include @@ -44,7 +40,7 @@ struct in_device; static DEFINE_SPINLOCK(log_lock); /* One level of recursion won't kill us */ -static void dump_packet(const struct ip6t_log_info *info, +static void dump_packet(const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int ip6hoff, int recurse) { @@ -53,6 +49,12 @@ static void dump_packet(const struct ip6t_log_info *info, struct ipv6hdr _ip6h, *ih; unsigned int ptr; unsigned int hdrlen = 0; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); if (ih == NULL) { @@ -84,7 +86,7 @@ static void dump_packet(const struct ip6t_log_info *info, } /* Max length: 48 "OPT (...) " */ - if (info->logflags & IP6T_LOG_IPOPT) + if (logflags & IP6T_LOG_IPOPT) printk("OPT ( "); switch (currenthdr) { @@ -119,7 +121,7 @@ static void dump_packet(const struct ip6t_log_info *info, case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: if (fragment) { - if (info->logflags & IP6T_LOG_IPOPT) + if (logflags & IP6T_LOG_IPOPT) printk(")"); return; } @@ -127,7 +129,7 @@ static void dump_packet(const struct ip6t_log_info *info, break; /* Max Length */ case IPPROTO_AH: - if (info->logflags & IP6T_LOG_IPOPT) { + if (logflags & IP6T_LOG_IPOPT) { struct ip_auth_hdr _ahdr, *ah; /* Max length: 3 "AH " */ @@ -158,7 +160,7 @@ static void dump_packet(const struct ip6t_log_info *info, hdrlen = (hp->hdrlen+2)<<2; break; case IPPROTO_ESP: - if (info->logflags & IP6T_LOG_IPOPT) { + if (logflags & IP6T_LOG_IPOPT) { struct ip_esp_hdr _esph, *eh; /* Max length: 4 "ESP " */ @@ -190,7 +192,7 @@ static void dump_packet(const struct ip6t_log_info *info, printk("Unknown Ext Hdr %u", currenthdr); return; } - if (info->logflags & IP6T_LOG_IPOPT) + if (logflags & IP6T_LOG_IPOPT) printk(") "); currenthdr = hp->nexthdr; @@ -218,7 +220,7 @@ static void dump_packet(const struct ip6t_log_info *info, printk("SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (info->logflags & IP6T_LOG_TCPSEQ) + if (logflags & IP6T_LOG_TCPSEQ) printk("SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ @@ -245,7 +247,7 @@ static void dump_packet(const struct ip6t_log_info *info, /* Max length: 11 "URGP=65535 " */ printk("URGP=%u ", ntohs(th->urg_ptr)); - if ((info->logflags & IP6T_LOG_TCPOPT) + if ((logflags & IP6T_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; unsigned int i; @@ -349,7 +351,7 @@ static void dump_packet(const struct ip6t_log_info *info, } /* Max length: 15 "UID=4294967295 " */ - if ((info->logflags & IP6T_LOG_UID) && recurse && skb->sk) { + if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) printk("UID=%u ", skb->sk->sk_socket->file->f_uid); @@ -357,19 +359,31 @@ static void dump_packet(const struct ip6t_log_info *info, } } +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 0, + .logflags = NF_LOG_MASK, + }, + }, +}; + static void -ip6t_log_packet(unsigned int hooknum, +ip6t_log_packet(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const struct ip6t_log_info *loginfo, - const char *level_string, + const struct nf_loginfo *loginfo, const char *prefix) { + if (!loginfo) + loginfo = &default_loginfo; + spin_lock_bh(&log_lock); - printk(level_string); - printk("%sIN=%s OUT=%s ", - prefix == NULL ? loginfo->prefix : prefix, + printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); if (in && !out) { @@ -416,29 +430,17 @@ ip6t_log_target(struct sk_buff **pskb, void *userinfo) { const struct ip6t_log_info *loginfo = targinfo; - char level_string[4] = "< >"; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; - level_string[1] = '0' + (loginfo->level % 8); - ip6t_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, loginfo->prefix); return IP6T_CONTINUE; } -static void -ip6t_logfn(unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const char *prefix) -{ - struct ip6t_log_info loginfo = { - .level = 0, - .logflags = IP6T_LOG_MASK, - .prefix = "" - }; - - ip6t_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); -} static int ip6t_log_checkentry(const char *tablename, const struct ip6t_entry *e, @@ -475,20 +477,29 @@ static struct ip6t_target ip6t_log_reg = { .me = THIS_MODULE, }; +static struct nf_logger ip6t_logger = { + .name = "ip6t_LOG", + .logfn = &ip6t_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { if (ip6t_register_target(&ip6t_log_reg)) return -EINVAL; - if (nflog) - nf_log_register(PF_INET6, &ip6t_logfn); + if (nf_log_register(PF_INET6, &ip6t_logger) < 0) { + printk(KERN_WARNING "ip6t_LOG: not logging via system console " + "since somebody else already registered for PF_INET6\n"); + /* we cannot make module load fail here, since otherwise + * ip6tables userspace would abort */ + } return 0; } static void __exit fini(void) { - if (nflog) - nf_log_unregister(PF_INET6, &ip6t_logfn); + nf_log_unregister_logger(&ip6t_logger); ip6t_unregister_target(&ip6t_log_reg); } -- cgit v0.10.2 From 0597f2680d666a3bcf101ac0c771ba7e50016bbd Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:58:39 -0700 Subject: [NETFILTER]: Add new "nfnetlink_log" userspace packet logging facility This is a generic (layer3 independent) version of what ipt_ULOG is already doing for IPv4 today. ipt_ULOG, ebt_ulog and finally also ip[6]t_LOG will be deprecated by this mechanism in the long term. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h new file mode 100644 index 0000000..420ff46 --- /dev/null +++ b/include/linux/netfilter/nfnetlink_log.h @@ -0,0 +1,85 @@ +#ifndef _NFNETLINK_LOG_H +#define _NFNETLINK_LOG_H + +/* This file describes the netlink messages (i.e. 'protocol packets'), + * and not any kind of function definitions. It is shared between kernel and + * userspace. Don't put kernel specific stuff in here */ + +#include + +enum nfulnl_msg_types { + NFULNL_MSG_PACKET, /* packet from kernel to userspace */ + NFULNL_MSG_CONFIG, /* connect to a particular queue */ + + NFULNL_MSG_MAX +}; + +struct nfulnl_msg_packet_hdr { + u_int16_t hw_protocol; /* hw protocol (network order) */ + u_int8_t hook; /* netfilter hook */ + u_int8_t _pad; +} __attribute__ ((packed)); + +struct nfulnl_msg_packet_hw { + u_int16_t hw_addrlen; + u_int16_t _pad; + u_int8_t hw_addr[8]; +} __attribute__ ((packed)); + +struct nfulnl_msg_packet_timestamp { + u_int64_t sec; + u_int64_t usec; +} __attribute__ ((packed)); + +#define NFULNL_PREFIXLEN 30 /* just like old log target */ + +enum nfulnl_attr_type { + NFULA_UNSPEC, + NFULA_PACKET_HDR, + NFULA_MARK, /* u_int32_t nfmark */ + NFULA_TIMESTAMP, /* nfulnl_msg_packet_timestamp */ + NFULA_IFINDEX_INDEV, /* u_int32_t ifindex */ + NFULA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ + NFULA_HWADDR, /* nfulnl_msg_packet_hw */ + NFULA_PAYLOAD, /* opaque data payload */ + NFULA_PREFIX, /* string prefix */ + NFULA_UID, /* user id of socket */ + + __NFULA_MAX +}; +#define NFULA_MAX (__NFULA_MAX - 1) + +enum nfulnl_msg_config_cmds { + NFULNL_CFG_CMD_NONE, + NFULNL_CFG_CMD_BIND, + NFULNL_CFG_CMD_UNBIND, + NFULNL_CFG_CMD_PF_BIND, + NFULNL_CFG_CMD_PF_UNBIND, +}; + +struct nfulnl_msg_config_cmd { + u_int8_t command; /* nfulnl_msg_config_cmds */ +} __attribute__ ((packed)); + +struct nfulnl_msg_config_mode { + u_int32_t copy_range; + u_int8_t copy_mode; + u_int8_t _pad; +} __attribute__ ((packed)); + +enum nfulnl_attr_config { + NFULA_CFG_UNSPEC, + NFULA_CFG_CMD, /* nfulnl_msg_config_cmd */ + NFULA_CFG_MODE, /* nfulnl_msg_config_mode */ + NFULA_CFG_NLBUFSIZ, /* u_int32_t buffer size */ + NFULA_CFG_TIMEOUT, /* u_int32_t in 1/100 s */ + NFULA_CFG_QTHRESH, /* u_int32_t */ + __NFULA_CFG_MAX +}; +#define NFULA_CFG_MAX (__NFULA_CFG_MAX -1) + +#define NFULNL_COPY_NONE 0x00 +#define NFULNL_COPY_META 0x01 +#define NFULNL_COPY_PACKET 0x02 + +#endif /* _NFNETLINK_LOG_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f0eb23e..8296b38 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -11,3 +11,14 @@ config NETFILTER_NETLINK_QUEUE If this option isenabled, the kernel will include support for queueing packets via NFNETLINK. +config NETFILTER_NETLINK_LOG + tristate "Netfilter LOG over NFNETLINK interface" + depends on NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + for logging packets via NFNETLINK. + + This obsoletes the existing ipt_ULOG and ebg_ulog mechanisms, + and is also scheduled to replace the old syslog-based ipt_LOG + and ip6t_LOG modules. + diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 14a0b18..c41caeb 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o +obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c new file mode 100644 index 0000000..f41045e --- /dev/null +++ b/net/netfilter/nfnetlink_log.c @@ -0,0 +1,995 @@ +/* + * This is a module which is used for logging packets to userspace via + * nfetlink. + * + * (C) 2005 by Harald Welte + * + * Based on the old ipv4-only ipt_ULOG.c: + * (C) 2000-2004 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NFULNL_NLBUFSIZ_DEFAULT 4096 +#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ +#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ + +#define PRINTR(x, args...) do { if (net_ratelimit()) \ + printk(x, ## args); } while (0); + +#if 0 +#define UDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ + __FILE__, __LINE__, __FUNCTION__, \ + ## args) +#else +#define UDEBUG(x, ...) +#endif + +struct nfulnl_instance { + struct hlist_node hlist; /* global list of instances */ + spinlock_t lock; + atomic_t use; /* use count */ + + unsigned int qlen; /* number of nlmsgs in skb */ + struct sk_buff *skb; /* pre-allocatd skb */ + struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ + struct timer_list timer; + int peer_pid; /* PID of the peer process */ + + /* configurable parameters */ + unsigned int flushtimeout; /* timeout until queue flush */ + unsigned int nlbufsiz; /* netlink buffer allocation size */ + unsigned int qthreshold; /* threshold of the queue */ + u_int32_t copy_range; + u_int16_t group_num; /* number of this queue */ + u_int8_t copy_mode; +}; + +static DEFINE_RWLOCK(instances_lock); + +#define INSTANCE_BUCKETS 16 +static struct hlist_head instance_table[INSTANCE_BUCKETS]; +static unsigned int hash_init; + +static inline u_int8_t instance_hashfn(u_int16_t group_num) +{ + return ((group_num & 0xff) % INSTANCE_BUCKETS); +} + +static struct nfulnl_instance * +__instance_lookup(u_int16_t group_num) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct nfulnl_instance *inst; + + UDEBUG("entering (group_num=%u)\n", group_num); + + head = &instance_table[instance_hashfn(group_num)]; + hlist_for_each_entry(inst, pos, head, hlist) { + if (inst->group_num == group_num) + return inst; + } + return NULL; +} + +static inline void +instance_get(struct nfulnl_instance *inst) +{ + atomic_inc(&inst->use); +} + +static struct nfulnl_instance * +instance_lookup_get(u_int16_t group_num) +{ + struct nfulnl_instance *inst; + + read_lock_bh(&instances_lock); + inst = __instance_lookup(group_num); + if (inst) + instance_get(inst); + read_unlock_bh(&instances_lock); + + return inst; +} + +static void +instance_put(struct nfulnl_instance *inst) +{ + if (inst && atomic_dec_and_test(&inst->use)) { + UDEBUG("kfree(inst=%p)\n", inst); + kfree(inst); + } +} + +static void nfulnl_timer(unsigned long data); + +static struct nfulnl_instance * +instance_create(u_int16_t group_num, int pid) +{ + struct nfulnl_instance *inst; + + UDEBUG("entering (group_num=%u, pid=%d)\n", group_num, + pid); + + write_lock_bh(&instances_lock); + if (__instance_lookup(group_num)) { + inst = NULL; + UDEBUG("aborting, instance already exists\n"); + goto out_unlock; + } + + inst = kmalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) + goto out_unlock; + + memset(inst, 0, sizeof(*inst)); + INIT_HLIST_NODE(&inst->hlist); + inst->lock = SPIN_LOCK_UNLOCKED; + /* needs to be two, since we _put() after creation */ + atomic_set(&inst->use, 2); + + init_timer(&inst->timer); + inst->timer.function = nfulnl_timer; + inst->timer.data = (unsigned long)inst; + /* don't start timer yet. (re)start it with every packet */ + + inst->peer_pid = pid; + inst->group_num = group_num; + + inst->qthreshold = NFULNL_QTHRESH_DEFAULT; + inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT; + inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT; + inst->copy_mode = NFULNL_COPY_PACKET; + inst->copy_range = 0xffff; + + if (!try_module_get(THIS_MODULE)) + goto out_free; + + hlist_add_head(&inst->hlist, + &instance_table[instance_hashfn(group_num)]); + + UDEBUG("newly added node: %p, next=%p\n", &inst->hlist, + inst->hlist.next); + + write_unlock_bh(&instances_lock); + + return inst; + +out_free: + instance_put(inst); +out_unlock: + write_unlock_bh(&instances_lock); + return NULL; +} + +static int __nfulnl_send(struct nfulnl_instance *inst); + +static void +_instance_destroy2(struct nfulnl_instance *inst, int lock) +{ + /* first pull it out of the global list */ + if (lock) + write_lock_bh(&instances_lock); + + UDEBUG("removing instance %p (queuenum=%u) from hash\n", + inst, inst->group_num); + + hlist_del(&inst->hlist); + + if (lock) + write_unlock_bh(&instances_lock); + + /* then flush all pending packets from skb */ + + spin_lock_bh(&inst->lock); + if (inst->skb) { + if (inst->qlen) + __nfulnl_send(inst); + if (inst->skb) { + kfree_skb(inst->skb); + inst->skb = NULL; + } + } + spin_unlock_bh(&inst->lock); + + /* and finally put the refcount */ + instance_put(inst); + + module_put(THIS_MODULE); +} + +static inline void +__instance_destroy(struct nfulnl_instance *inst) +{ + _instance_destroy2(inst, 0); +} + +static inline void +instance_destroy(struct nfulnl_instance *inst) +{ + _instance_destroy2(inst, 1); +} + +static int +nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode, + unsigned int range) +{ + int status = 0; + + spin_lock_bh(&inst->lock); + + switch (mode) { + case NFULNL_COPY_NONE: + case NFULNL_COPY_META: + inst->copy_mode = mode; + inst->copy_range = 0; + break; + + case NFULNL_COPY_PACKET: + inst->copy_mode = mode; + /* we're using struct nfattr which has 16bit nfa_len */ + if (range > 0xffff) + inst->copy_range = 0xffff; + else + inst->copy_range = range; + break; + + default: + status = -EINVAL; + break; + } + + spin_unlock_bh(&inst->lock); + + return status; +} + +static int +nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz) +{ + int status; + + spin_lock_bh(&inst->lock); + if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT) + status = -ERANGE; + else if (nlbufsiz > 131072) + status = -ERANGE; + else { + inst->nlbufsiz = nlbufsiz; + status = 0; + } + spin_unlock_bh(&inst->lock); + + return status; +} + +static int +nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout) +{ + spin_lock_bh(&inst->lock); + inst->flushtimeout = timeout; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static int +nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh) +{ + spin_lock_bh(&inst->lock); + inst->qthreshold = qthresh; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size, + unsigned int pkt_size) +{ + struct sk_buff *skb; + + UDEBUG("entered (%u, %u)\n", inst_size, pkt_size); + + /* alloc skb which should be big enough for a whole multipart + * message. WARNING: has to be <= 128k due to slab restrictions */ + + skb = alloc_skb(inst_size, GFP_ATOMIC); + if (!skb) { + PRINTR("nfnetlink_log: can't alloc whole buffer (%u bytes)\n", + inst_size); + + /* try to allocate only as much as we need for current + * packet */ + + skb = alloc_skb(pkt_size, GFP_ATOMIC); + if (!skb) + PRINTR("nfnetlink_log: can't even alloc %u bytes\n", + pkt_size); + } + + return skb; +} + +static int +__nfulnl_send(struct nfulnl_instance *inst) +{ + int status; + + if (timer_pending(&inst->timer)) + del_timer(&inst->timer); + + if (inst->qlen > 1) + inst->lastnlh->nlmsg_type = NLMSG_DONE; + + status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT); + if (status < 0) { + UDEBUG("netlink_unicast() failed\n"); + /* FIXME: statistics */ + } + + inst->qlen = 0; + inst->skb = NULL; + inst->lastnlh = NULL; + + return status; +} + +static void nfulnl_timer(unsigned long data) +{ + struct nfulnl_instance *inst = (struct nfulnl_instance *)data; + + UDEBUG("timer function called, flushing buffer\n"); + + spin_lock_bh(&inst->lock); + __nfulnl_send(inst); + instance_put(inst); + spin_unlock_bh(&inst->lock); +} + +static inline int +__build_packet_message(struct nfulnl_instance *inst, + const struct sk_buff *skb, + unsigned int data_len, + unsigned int pf, + unsigned int hooknum, + const struct net_device *indev, + const struct net_device *outdev, + const struct nf_loginfo *li, + const char *prefix) +{ + unsigned char *old_tail; + struct nfulnl_msg_packet_hdr pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + u_int32_t tmp_uint; + + UDEBUG("entered\n"); + + old_tail = inst->skb->tail; + nlh = NLMSG_PUT(inst->skb, 0, 0, + NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, + sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + nfmsg->nfgen_family = pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(inst->group_num); + + pmsg.hw_protocol = htons(skb->protocol); + pmsg.hook = hooknum; + + NFA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg); + + if (prefix) { + int slen = strlen(prefix); + if (slen > NFULNL_PREFIXLEN) + slen = NFULNL_PREFIXLEN; + NFA_PUT(inst->skb, NFULA_PREFIX, slen, prefix); + } + + if (indev) { + tmp_uint = htonl(indev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint), + &tmp_uint); + } + + if (outdev) { + tmp_uint = htonl(outdev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint), + &tmp_uint); + } + + if (skb->nfmark) { + tmp_uint = htonl(skb->nfmark); + NFA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint); + } + + if (indev && skb->dev && skb->dev->hard_header_parse) { + struct nfulnl_msg_packet_hw phw; + + phw.hw_addrlen = + skb->dev->hard_header_parse((struct sk_buff *)skb, + phw.hw_addr); + phw.hw_addrlen = htons(phw.hw_addrlen); + NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw); + } + + if (skb->stamp.tv_sec) { + struct nfulnl_msg_packet_timestamp ts; + + ts.sec = cpu_to_be64(skb->stamp.tv_sec); + ts.usec = cpu_to_be64(skb->stamp.tv_usec); + + NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts); + } + + /* UID */ + if (skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) { + u_int32_t uid = htonl(skb->sk->sk_socket->file->f_uid); + /* need to unlock here since NFA_PUT may goto */ + read_unlock_bh(&skb->sk->sk_callback_lock); + NFA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid); + } else + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + if (data_len) { + struct nfattr *nfa; + int size = NFA_LENGTH(data_len); + + if (skb_tailroom(inst->skb) < (int)NFA_SPACE(data_len)) { + printk(KERN_WARNING "nfnetlink_log: no tailroom!\n"); + goto nlmsg_failure; + } + + nfa = (struct nfattr *)skb_put(inst->skb, NFA_ALIGN(size)); + nfa->nfa_type = NFULA_PAYLOAD; + nfa->nfa_len = size; + + if (skb_copy_bits(skb, 0, NFA_DATA(nfa), data_len)) + BUG(); + } + + nlh->nlmsg_len = inst->skb->tail - old_tail; + return 0; + +nlmsg_failure: + UDEBUG("nlmsg_failure\n"); +nfattr_failure: + PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); + return -1; +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_ULOG, + .u = { + .ulog = { + .copy_len = 0xffff, + .group = 0, + .qthreshold = 1, + }, + }, +}; + +/* log handler for internal netfilter logging api */ +static void +nfulnl_log_packet(unsigned int pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *li_user, + const char *prefix) +{ + unsigned int size, data_len; + struct nfulnl_instance *inst; + const struct nf_loginfo *li; + unsigned int qthreshold; + unsigned int nlbufsiz; + + if (li_user && li_user->type == NF_LOG_TYPE_ULOG) + li = li_user; + else + li = &default_loginfo; + + inst = instance_lookup_get(li->u.ulog.group); + if (!inst) + inst = instance_lookup_get(0); + if (!inst) { + PRINTR("nfnetlink_log: trying to log packet, " + "but no instance for group %u\n", li->u.ulog.group); + return; + } + + /* all macros expand to constant values at compile time */ + /* FIXME: do we want to make the size calculation conditional based on + * what is actually present? way more branches and checks, but more + * memory efficient... */ + size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr)) + + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NFA_SPACE(sizeof(u_int32_t)) /* mark */ + + NFA_SPACE(sizeof(u_int32_t)) /* uid */ + + NFA_SPACE(NFULNL_PREFIXLEN) /* prefix */ + + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hw)) + + NFA_SPACE(sizeof(struct nfulnl_msg_packet_timestamp)); + + UDEBUG("initial size=%u\n", size); + + spin_lock_bh(&inst->lock); + + qthreshold = inst->qthreshold; + /* per-rule qthreshold overrides per-instance */ + if (qthreshold > li->u.ulog.qthreshold) + qthreshold = li->u.ulog.qthreshold; + + switch (inst->copy_mode) { + case NFULNL_COPY_META: + case NFULNL_COPY_NONE: + data_len = 0; + break; + + case NFULNL_COPY_PACKET: + if (inst->copy_range == 0 + || inst->copy_range > skb->len) + data_len = skb->len; + else + data_len = inst->copy_range; + + size += NFA_SPACE(data_len); + UDEBUG("copy_packet, therefore size now %u\n", size); + break; + + default: + spin_unlock_bh(&inst->lock); + instance_put(inst); + return; + } + + if (size > inst->nlbufsiz) + nlbufsiz = size; + else + nlbufsiz = inst->nlbufsiz; + + if (!inst->skb) { + if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) { + UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n", + inst->nlbufsiz, size); + goto alloc_failure; + } + } else if (inst->qlen >= qthreshold || + size > skb_tailroom(inst->skb)) { + /* either the queue len is too high or we don't have + * enough room in the skb left. flush to userspace. */ + UDEBUG("flushing old skb\n"); + + __nfulnl_send(inst); + + if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) { + UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n", + inst->nlbufsiz, size); + goto alloc_failure; + } + } + + UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold); + inst->qlen++; + + __build_packet_message(inst, skb, data_len, pf, + hooknum, in, out, li, prefix); + + /* timer_pending always called within inst->lock, so there + * is no chance of a race here */ + if (!timer_pending(&inst->timer)) { + instance_get(inst); + inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100); + add_timer(&inst->timer); + } + spin_unlock_bh(&inst->lock); + + return; + +alloc_failure: + spin_unlock_bh(&inst->lock); + instance_put(inst); + UDEBUG("error allocating skb\n"); + /* FIXME: statistics */ +} + +static int +nfulnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_NETFILTER && n->pid) { + int i; + + /* destroy all instances for this pid */ + write_lock_bh(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp, *t2; + struct nfulnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { + UDEBUG("node = %p\n", inst); + if (n->pid == inst->peer_pid) + __instance_destroy(inst); + } + } + write_unlock_bh(&instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfulnl_rtnl_notifier = { + .notifier_call = nfulnl_rcv_nl_event, +}; + +static int +nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + return -ENOTSUPP; +} + +static struct nf_logger nfulnl_logger = { + .name = "nfnetlink_log", + .logfn = &nfulnl_log_packet, + .me = THIS_MODULE, +}; + +static const int nfula_min[NFULA_MAX] = { + [NFULA_PACKET_HDR-1] = sizeof(struct nfulnl_msg_packet_hdr), + [NFULA_MARK-1] = sizeof(u_int32_t), + [NFULA_TIMESTAMP-1] = sizeof(struct nfulnl_msg_packet_timestamp), + [NFULA_IFINDEX_INDEV-1] = sizeof(u_int32_t), + [NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t), + [NFULA_HWADDR-1] = sizeof(struct nfulnl_msg_packet_hw), + [NFULA_PAYLOAD-1] = 0, + [NFULA_PREFIX-1] = 0, + [NFULA_UID-1] = sizeof(u_int32_t), +}; + +static const int nfula_cfg_min[NFULA_CFG_MAX] = { + [NFULA_CFG_CMD-1] = sizeof(struct nfulnl_msg_config_cmd), + [NFULA_CFG_MODE-1] = sizeof(struct nfulnl_msg_config_mode), + [NFULA_CFG_TIMEOUT-1] = sizeof(u_int32_t), + [NFULA_CFG_QTHRESH-1] = sizeof(u_int32_t), + [NFULA_CFG_NLBUFSIZ-1] = sizeof(u_int32_t), +}; + +static int +nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfula[], int *errp) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t group_num = ntohs(nfmsg->res_id); + struct nfulnl_instance *inst; + int ret = 0; + + UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); + + if (nfattr_bad_size(nfula, NFULA_CFG_MAX, nfula_cfg_min)) { + UDEBUG("bad attribute size\n"); + return -EINVAL; + } + + inst = instance_lookup_get(group_num); + if (nfula[NFULA_CFG_CMD-1]) { + u_int8_t pf = nfmsg->nfgen_family; + struct nfulnl_msg_config_cmd *cmd; + cmd = NFA_DATA(nfula[NFULA_CFG_CMD-1]); + UDEBUG("found CFG_CMD for\n"); + + switch (cmd->command) { + case NFULNL_CFG_CMD_BIND: + if (inst) { + ret = -EBUSY; + goto out_put; + } + + inst = instance_create(group_num, + NETLINK_CB(skb).pid); + if (!inst) { + ret = -EINVAL; + goto out_put; + } + break; + case NFULNL_CFG_CMD_UNBIND: + if (!inst) { + ret = -ENODEV; + goto out_put; + } + + if (inst->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto out_put; + } + + instance_destroy(inst); + break; + case NFULNL_CFG_CMD_PF_BIND: + UDEBUG("registering log handler for pf=%u\n", pf); + ret = nf_log_register(pf, &nfulnl_logger); + break; + case NFULNL_CFG_CMD_PF_UNBIND: + UDEBUG("unregistering log handler for pf=%u\n", pf); + /* This is a bug and a feature. We cannot unregister + * other handlers, like nfnetlink_inst can */ + nf_log_unregister_pf(pf); + break; + default: + ret = -EINVAL; + break; + } + } else { + if (!inst) { + UDEBUG("no config command, and no instance for " + "group=%u pid=%u =>ENOENT\n", + group_num, NETLINK_CB(skb).pid); + ret = -ENOENT; + goto out_put; + } + + if (inst->peer_pid != NETLINK_CB(skb).pid) { + UDEBUG("no config command, and wrong pid\n"); + ret = -EPERM; + goto out_put; + } + } + + if (nfula[NFULA_CFG_MODE-1]) { + struct nfulnl_msg_config_mode *params; + params = NFA_DATA(nfula[NFULA_CFG_MODE-1]); + + nfulnl_set_mode(inst, params->copy_mode, + ntohs(params->copy_range)); + } + + if (nfula[NFULA_CFG_TIMEOUT-1]) { + u_int32_t timeout = + *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_TIMEOUT-1]); + + nfulnl_set_timeout(inst, ntohl(timeout)); + } + + if (nfula[NFULA_CFG_NLBUFSIZ-1]) { + u_int32_t nlbufsiz = + *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_NLBUFSIZ-1]); + + nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); + } + + if (nfula[NFULA_CFG_QTHRESH-1]) { + u_int32_t qthresh = + *(u_int16_t *)NFA_DATA(nfula[NFULA_CFG_QTHRESH-1]); + + nfulnl_set_qthresh(inst, ntohl(qthresh)); + } + +out_put: + instance_put(inst); + return ret; +} + +static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { + [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, + .cap_required = CAP_NET_ADMIN }, + [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnetlink_subsystem nfulnl_subsys = { + .name = "log", + .subsys_id = NFNL_SUBSYS_ULOG, + .cb_count = NFULNL_MSG_MAX, + .attr_count = NFULA_MAX, + .cb = nfulnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + + if (!st) + return NULL; + + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&instance_table[st->bucket])) + return instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + struct iter_state *st = seq->private; + + h = h->next; + while (!h) { + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + h = instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&instances_lock); + return get_idx(seq, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfulnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n", + inst->group_num, + inst->peer_pid, inst->qlen, + inst->copy_mode, inst->copy_range, + inst->flushtimeout, atomic_read(&inst->use)); +} + +static struct seq_operations nful_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nful_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct iter_state *is; + int ret; + + is = kmalloc(sizeof(*is), GFP_KERNEL); + if (!is) + return -ENOMEM; + memset(is, 0, sizeof(*is)); + ret = seq_open(file, &nful_seq_ops); + if (ret < 0) + goto out_free; + seq = file->private_data; + seq->private = is; + return ret; +out_free: + kfree(is); + return ret; +} + +static struct file_operations nful_file_ops = { + .owner = THIS_MODULE, + .open = nful_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ + +static int +init_or_cleanup(int init) +{ + int i, status = -ENOMEM; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_nful; +#endif + + if (!init) + goto cleanup; + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&instance_table[i]); + + /* it's not really all that important to have a random value, so + * we can do this from the init function, even if there hasn't + * been that much entropy yet */ + get_random_bytes(&hash_init, sizeof(hash_init)); + + netlink_register_notifier(&nfulnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfulnl_subsys); + if (status < 0) { + printk(KERN_ERR "log: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + +#ifdef CONFIG_PROC_FS + proc_nful = create_proc_entry("nfnetlink_log", 0440, + proc_net_netfilter); + if (!proc_nful) + goto cleanup_subsys; + proc_nful->proc_fops = &nful_file_ops; +#endif + + return status; + +cleanup: + nf_log_unregister_logger(&nfulnl_logger); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_log", proc_net_netfilter); +cleanup_subsys: +#endif + nfnetlink_subsys_unregister(&nfulnl_subsys); +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfulnl_rtnl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("netfilter userspace logging"); +MODULE_AUTHOR("Harald Welte "); +MODULE_LICENSE("GPL"); + +module_init(init); +module_exit(fini); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index eab309e..d7b0330 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1041,6 +1041,7 @@ cleanup: nf_unregister_queue_handlers(nfqnl_enqueue_packet); unregister_netdevice_notifier(&nfqnl_dev_notifier); #ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_queue", proc_net_netfilter); cleanup_subsys: #endif nfnetlink_subsys_unregister(&nfqnl_subsys); -- cgit v0.10.2 From 304a16180fb6d2b153b45f6fbbcec1fa814496e5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:59:20 -0700 Subject: [INET]: Move the TCP ehash functions to include/net/inet_hashtables.h To be shared with DCCP (and others), this is the start of a series of patches that will expose the already generic TCP hash table routines. The few changes noticed when calling gcc -S before/after on a pentium4 were of this type: movl 40(%esp), %edx cmpl %esi, 472(%edx) je .L168 - pushl $291 + pushl $272 pushl $.LC0 pushl $.LC1 pushl $.LC2 [acme@toy net-2.6.14]$ size net/ipv4/tcp_ipv4.before.o net/ipv4/tcp_ipv4.after.o text data bss dec hex filename 17804 516 140 18460 481c net/ipv4/tcp_ipv4.before.o 17804 516 140 18460 481c net/ipv4/tcp_ipv4.after.o Holler if some weird architecture has issues with things like this 8) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h new file mode 100644 index 0000000..c4c9e39 --- /dev/null +++ b/include/net/inet_hashtables.h @@ -0,0 +1,40 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _INET_HASHTABLES_H +#define _INET_HASHTABLES_H + +#include + +static inline int inet_ehashfn(const __u32 laddr, const __u16 lport, + const __u32 faddr, const __u16 fport, + const int ehash_size) +{ + int h = (laddr ^ lport) ^ (faddr ^ fport); + h ^= h >> 16; + h ^= h >> 8; + return h & (ehash_size - 1); +} + +static inline int inet_sk_ehashfn(const struct sock *sk, const int ehash_size) +{ + const struct inet_sock *inet = inet_sk(sk); + const __u32 laddr = inet->rcv_saddr; + const __u16 lport = inet->num; + const __u32 faddr = inet->daddr; + const __u16 fport = inet->dport; + + return inet_ehashfn(laddr, lport, faddr, fport, ehash_size); +} + +#endif /* _INET_HASHTABLES_H */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ae6fad9..c03d7e9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -64,6 +64,7 @@ #include #include +#include #include #include #include @@ -104,26 +105,6 @@ struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { int sysctl_local_port_range[2] = { 1024, 4999 }; int tcp_port_rover = 1024 - 1; -static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, - __u32 faddr, __u16 fport) -{ - int h = (laddr ^ lport) ^ (faddr ^ fport); - h ^= h >> 16; - h ^= h >> 8; - return h & (tcp_ehash_size - 1); -} - -static __inline__ int tcp_sk_hashfn(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - __u32 laddr = inet->rcv_saddr; - __u16 lport = inet->num; - __u32 faddr = inet->daddr; - __u16 fport = inet->dport; - - return tcp_hashfn(laddr, lport, faddr, fport); -} - /* Allocate and initialize a new TCP local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. */ @@ -367,7 +348,8 @@ static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) lock = &tcp_lhash_lock; tcp_listen_wlock(); } else { - list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain; + sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size); + list = &tcp_ehash[sk->sk_hashent].chain; lock = &tcp_ehash[sk->sk_hashent].lock; write_lock(lock); } @@ -500,7 +482,7 @@ static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - int hash = tcp_hashfn(daddr, hnum, saddr, sport); + const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size); head = &tcp_ehash[hash]; read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { @@ -563,7 +545,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, int dif = sk->sk_bound_dev_if; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); - int hash = tcp_hashfn(daddr, lport, saddr, inet->dport); + const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size); struct tcp_ehash_bucket *head = &tcp_ehash[hash]; struct sock *sk2; struct hlist_node *node; -- cgit v0.10.2 From 0f7ff9274e72fd254fbd1ab117bbc1db6e7cdb34 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:59:44 -0700 Subject: [INET]: Just rename the TCP hashtable functions/structs to inet_ This is to break down the complexity of the series of patches, making it very clear that this one just does: 1. renames tcp_ prefixed hashtable functions and data structures that were already mostly generic to inet_ to share it with DCCP and other INET transport protocols. 2. Removes not used functions (__tb_head & tb_head) 3. Removes some leftover prototypes in the headers (tcp_bucket_unlock & tcp_v4_build_header) Next changesets will move tcp_sk(sk)->bind_hash to inet_sock so that we can make functions such as tcp_inherit_port, __tcp_inherit_port, tcp_v4_get_port, __tcp_put_port, generic and get others like tcp_destroy_sock closer to generic (tcp_orphan_count will go to sk->sk_prot to allow this). Eventually most of these functions will be used passing the transport protocol inet_hashinfo structure. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e4fd82e..ec580a5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -258,7 +258,7 @@ struct tcp_sock { __u32 snd_sml; /* Last byte of the most recently transmitted small packet */ __u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ __u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ - struct tcp_bind_bucket *bind_hash; + struct inet_bind_bucket *bind_hash; /* Delayed ACK control data */ struct { __u8 pending; /* ACK is pending */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 0c769ad..6c9f6f7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -44,13 +44,13 @@ * New scheme, half the table is for TIME_WAIT, the other half is * for the rest. I'll experiment with dynamic table growth later. */ -struct tcp_ehash_bucket { +struct inet_ehash_bucket { rwlock_t lock; struct hlist_head chain; } __attribute__((__aligned__(8))); /* This is for listening sockets, thus all sockets which possess wildcards. */ -#define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ +#define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ /* There are a few simple rules, which allow for local port reuse by * an application. In essence: @@ -83,31 +83,22 @@ struct tcp_ehash_bucket { * users logged onto your box, isn't it nice to know that new data * ports are created in O(1) time? I thought so. ;-) -DaveM */ -struct tcp_bind_bucket { +struct inet_bind_bucket { unsigned short port; signed short fastreuse; struct hlist_node node; struct hlist_head owners; }; -#define tb_for_each(tb, node, head) hlist_for_each_entry(tb, node, head, node) +#define inet_bind_bucket_for_each(tb, node, head) \ + hlist_for_each_entry(tb, node, head, node) -struct tcp_bind_hashbucket { +struct inet_bind_hashbucket { spinlock_t lock; struct hlist_head chain; }; -static inline struct tcp_bind_bucket *__tb_head(struct tcp_bind_hashbucket *head) -{ - return hlist_entry(head->chain.first, struct tcp_bind_bucket, node); -} - -static inline struct tcp_bind_bucket *tb_head(struct tcp_bind_hashbucket *head) -{ - return hlist_empty(&head->chain) ? NULL : __tb_head(head); -} - -extern struct tcp_hashinfo { +struct inet_hashinfo { /* This is for sockets with full identity only. Sockets here will * always be without wildcards and will have the following invariant: * @@ -116,21 +107,21 @@ extern struct tcp_hashinfo { * First half of the table is for sockets not in TIME_WAIT, second half * is for TIME_WAIT sockets only. */ - struct tcp_ehash_bucket *__tcp_ehash; + struct inet_ehash_bucket *ehash; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. */ - struct tcp_bind_hashbucket *__tcp_bhash; + struct inet_bind_hashbucket *bhash; - int __tcp_bhash_size; - int __tcp_ehash_size; + int bhash_size; + int ehash_size; /* All sockets in TCP_LISTEN state will be in here. This is the only * table where wildcard'd TCP sockets can exist. Hash function here * is just local port number. */ - struct hlist_head __tcp_listening_hash[TCP_LHTABLE_SIZE]; + struct hlist_head listening_hash[INET_LHTABLE_SIZE]; /* All the above members are written once at bootup and * never written again _or_ are predominantly read-access. @@ -138,36 +129,39 @@ extern struct tcp_hashinfo { * Now align to a new cache line as all the following members * are often dirty. */ - rwlock_t __tcp_lhash_lock ____cacheline_aligned; - atomic_t __tcp_lhash_users; - wait_queue_head_t __tcp_lhash_wait; - spinlock_t __tcp_portalloc_lock; -} tcp_hashinfo; - -#define tcp_ehash (tcp_hashinfo.__tcp_ehash) -#define tcp_bhash (tcp_hashinfo.__tcp_bhash) -#define tcp_ehash_size (tcp_hashinfo.__tcp_ehash_size) -#define tcp_bhash_size (tcp_hashinfo.__tcp_bhash_size) -#define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash) -#define tcp_lhash_lock (tcp_hashinfo.__tcp_lhash_lock) -#define tcp_lhash_users (tcp_hashinfo.__tcp_lhash_users) -#define tcp_lhash_wait (tcp_hashinfo.__tcp_lhash_wait) -#define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock) + rwlock_t lhash_lock ____cacheline_aligned; + atomic_t lhash_users; + wait_queue_head_t lhash_wait; + spinlock_t portalloc_lock; +}; + +extern struct inet_hashinfo tcp_hashinfo; +#define tcp_ehash (tcp_hashinfo.ehash) +#define tcp_bhash (tcp_hashinfo.bhash) +#define tcp_ehash_size (tcp_hashinfo.ehash_size) +#define tcp_bhash_size (tcp_hashinfo.bhash_size) +#define tcp_listening_hash (tcp_hashinfo.listening_hash) +#define tcp_lhash_lock (tcp_hashinfo.lhash_lock) +#define tcp_lhash_users (tcp_hashinfo.lhash_users) +#define tcp_lhash_wait (tcp_hashinfo.lhash_wait) +#define tcp_portalloc_lock (tcp_hashinfo.portalloc_lock) extern kmem_cache_t *tcp_bucket_cachep; -extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, - unsigned short snum); -extern void tcp_bucket_destroy(struct tcp_bind_bucket *tb); -extern void tcp_bucket_unlock(struct sock *sk); +extern struct inet_bind_bucket * + inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, + const unsigned short snum); +extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, + struct inet_bind_bucket *tb); extern int tcp_port_rover; /* These are AF independent. */ -static __inline__ int tcp_bhashfn(__u16 lport) +static inline int inet_bhashfn(const __u16 lport, const int bhash_size) { - return (lport & (tcp_bhash_size - 1)); + return lport & (bhash_size - 1); } -extern void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, +extern void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, unsigned short snum); #if (BITS_PER_LONG == 64) @@ -212,7 +206,7 @@ struct tcp_tw_bucket { __u32 tw_ts_recent; long tw_ts_recent_stamp; unsigned long tw_ttd; - struct tcp_bind_bucket *tw_tb; + struct inet_bind_bucket *tw_tb; struct hlist_node tw_death_node; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct in6_addr tw_v6_daddr; @@ -366,14 +360,14 @@ extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) /* These can have wildcards, don't try too hard. */ -static __inline__ int tcp_lhashfn(unsigned short num) +static inline int inet_lhashfn(const unsigned short num) { - return num & (TCP_LHTABLE_SIZE - 1); + return num & (INET_LHTABLE_SIZE - 1); } -static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) +static inline int inet_sk_listen_hashfn(const struct sock *sk) { - return tcp_lhashfn(inet_sk(sk)->num); + return inet_lhashfn(inet_sk(sk)->num); } #define MAX_TCP_HEADER (128 + MAX_HEADER) @@ -799,9 +793,6 @@ extern void tcp_parse_options(struct sk_buff *skb, * TCP v4 functions exported for the inet6 API */ -extern int tcp_v4_build_header(struct sock *sk, - struct sk_buff *skb); - extern void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 20159a3..1ec03db 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -272,6 +272,9 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); kmem_cache_t *tcp_bucket_cachep; + +EXPORT_SYMBOL_GPL(tcp_bucket_cachep); + kmem_cache_t *tcp_timewait_cachep; atomic_t tcp_orphan_count = ATOMIC_INIT(0); @@ -2259,7 +2262,7 @@ void __init tcp_init(void) sizeof(skb->cb)); tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", - sizeof(struct tcp_bind_bucket), + sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if (!tcp_bucket_cachep) @@ -2277,9 +2280,9 @@ void __init tcp_init(void) * * The methodology is similar to that of the buffer cache. */ - tcp_ehash = (struct tcp_ehash_bucket *) + tcp_ehash = alloc_large_system_hash("TCP established", - sizeof(struct tcp_ehash_bucket), + sizeof(struct inet_ehash_bucket), thash_entries, (num_physpages >= 128 * 1024) ? (25 - PAGE_SHIFT) : @@ -2294,9 +2297,9 @@ void __init tcp_init(void) INIT_HLIST_HEAD(&tcp_ehash[i].chain); } - tcp_bhash = (struct tcp_bind_hashbucket *) + tcp_bhash = alloc_large_system_hash("TCP bind", - sizeof(struct tcp_bind_hashbucket), + sizeof(struct inet_bind_hashbucket), tcp_ehash_size, (num_physpages >= 128 * 1024) ? (25 - PAGE_SHIFT) : @@ -2315,7 +2318,7 @@ void __init tcp_init(void) * on available memory. */ for (order = 0; ((1 << order) << PAGE_SHIFT) < - (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); + (tcp_bhash_size * sizeof(struct inet_bind_hashbucket)); order++) ; if (order >= 4) { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index f79bd11..5bb6a0f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -590,7 +590,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) goto skip_listen_ht; tcp_listen_lock(); - for (i = s_i; i < TCP_LHTABLE_SIZE; i++) { + for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct sock *sk; struct hlist_node *node; @@ -646,7 +646,7 @@ skip_listen_ht: return skb->len; for (i = s_i; i < tcp_ehash_size; i++) { - struct tcp_ehash_bucket *head = &tcp_ehash[i]; + struct inet_ehash_bucket *head = &tcp_ehash[i]; struct sock *sk; struct hlist_node *node; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c03d7e9..4138630 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -89,12 +89,11 @@ static struct socket *tcp_socket; void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); -struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { - .__tcp_lhash_lock = RW_LOCK_UNLOCKED, - .__tcp_lhash_users = ATOMIC_INIT(0), - .__tcp_lhash_wait - = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), - .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED +struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { + .lhash_lock = RW_LOCK_UNLOCKED, + .lhash_users = ATOMIC_INIT(0), + .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), + .portalloc_lock = SPIN_LOCK_UNLOCKED, }; /* @@ -105,14 +104,14 @@ struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { int sysctl_local_port_range[2] = { 1024, 4999 }; int tcp_port_rover = 1024 - 1; -/* Allocate and initialize a new TCP local port bind bucket. +/* Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. */ -struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, - unsigned short snum) +struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, + const unsigned short snum) { - struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep, - SLAB_ATOMIC); + struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); if (tb) { tb->port = snum; tb->fastreuse = 0; @@ -123,20 +122,21 @@ struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, } /* Caller must hold hashbucket lock for this tb with local BH disabled */ -void tcp_bucket_destroy(struct tcp_bind_bucket *tb) +void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) { if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); - kmem_cache_free(tcp_bucket_cachep, tb); + kmem_cache_free(cachep, tb); } } /* Caller must disable local BH processing. */ static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) { - struct tcp_bind_hashbucket *head = - &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)]; - struct tcp_bind_bucket *tb; + struct inet_bind_hashbucket *head = + &tcp_bhash[inet_bhashfn(inet_sk(child)->num, + tcp_bhash_size)]; + struct inet_bind_bucket *tb; spin_lock(&head->lock); tb = tcp_sk(sk)->bind_hash; @@ -152,15 +152,15 @@ inline void tcp_inherit_port(struct sock *sk, struct sock *child) local_bh_enable(); } -void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, - unsigned short snum) +void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum) { inet_sk(sk)->num = snum; sk_add_bind_node(sk, &tb->owners); tcp_sk(sk)->bind_hash = tb; } -static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) +static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); struct sock *sk2; @@ -190,9 +190,9 @@ static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) */ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { - struct tcp_bind_hashbucket *head; + struct inet_bind_hashbucket *head; struct hlist_node *node; - struct tcp_bind_bucket *tb; + struct inet_bind_bucket *tb; int ret; local_bh_disable(); @@ -211,9 +211,9 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) rover++; if (rover > high) rover = low; - head = &tcp_bhash[tcp_bhashfn(rover)]; + head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)]; spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == rover) goto next; break; @@ -238,9 +238,9 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) */ snum = rover; } else { - head = &tcp_bhash[tcp_bhashfn(snum)]; + head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == snum) goto tb_found; } @@ -261,7 +261,7 @@ tb_found: } tb_not_found: ret = 1; - if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) + if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -290,15 +290,16 @@ fail: static void __tcp_put_port(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); - struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)]; - struct tcp_bind_bucket *tb; + struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num, + tcp_bhash_size)]; + struct inet_bind_bucket *tb; spin_lock(&head->lock); tb = tcp_sk(sk)->bind_hash; __sk_del_bind_node(sk); tcp_sk(sk)->bind_hash = NULL; inet->num = 0; - tcp_bucket_destroy(tb); + inet_bind_bucket_destroy(tcp_bucket_cachep, tb); spin_unlock(&head->lock); } @@ -344,7 +345,7 @@ static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) BUG_TRAP(sk_unhashed(sk)); if (listen_possible && sk->sk_state == TCP_LISTEN) { - list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; + list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)]; lock = &tcp_lhash_lock; tcp_listen_wlock(); } else { @@ -381,7 +382,7 @@ void tcp_unhash(struct sock *sk) tcp_listen_wlock(); lock = &tcp_lhash_lock; } else { - struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; + struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; lock = &head->lock; write_lock_bh(&head->lock); } @@ -401,8 +402,10 @@ void tcp_unhash(struct sock *sk) * connection. So always assume those are both wildcarded * during the search since they can never be otherwise. */ -static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, - unsigned short hnum, int dif) +static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, + const u32 daddr, + const unsigned short hnum, + const int dif) { struct sock *result = NULL, *sk; struct hlist_node *node; @@ -438,14 +441,15 @@ static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, } /* Optimize the common listener case. */ -static inline struct sock *tcp_v4_lookup_listener(u32 daddr, - unsigned short hnum, int dif) +static inline struct sock *tcp_v4_lookup_listener(const u32 daddr, + const unsigned short hnum, + const int dif) { struct sock *sk = NULL; struct hlist_head *head; read_lock(&tcp_lhash_lock); - head = &tcp_listening_hash[tcp_lhashfn(hnum)]; + head = &tcp_listening_hash[inet_lhashfn(hnum)]; if (!hlist_empty(head)) { struct inet_sock *inet = inet_sk((sk = __sk_head(head))); @@ -470,11 +474,13 @@ sherry_cache: * Local BH must be disabled here. */ -static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, - u32 daddr, u16 hnum, - int dif) +static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, + const u16 sport, + const u32 daddr, + const u16 hnum, + const int dif) { - struct tcp_ehash_bucket *head; + struct inet_ehash_bucket *head; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) __u32 ports = TCP_COMBINED_PORTS(sport, hnum); struct sock *sk; @@ -546,7 +552,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size); - struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + struct inet_ehash_bucket *head = &tcp_ehash[hash]; struct sock *sk2; struct hlist_node *node; struct tcp_tw_bucket *tw; @@ -639,9 +645,9 @@ static inline u32 connect_port_offset(const struct sock *sk) */ static inline int tcp_v4_hash_connect(struct sock *sk) { - unsigned short snum = inet_sk(sk)->num; - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; int ret; if (!snum) { @@ -658,14 +664,14 @@ static inline int tcp_v4_hash_connect(struct sock *sk) local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &tcp_bhash[tcp_bhashfn(port)]; + head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, * because the established check is already * unique enough. */ - tb_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, node, &head->chain) { if (tb->port == port) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) @@ -678,7 +684,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) } } - tb = tcp_bucket_create(head, port); + tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port); if (!tb) { spin_unlock(&head->lock); break; @@ -713,7 +719,7 @@ ok: goto out; } - head = &tcp_bhash[tcp_bhashfn(snum)]; + head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; tb = tcp_sk(sk)->bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { @@ -2055,7 +2061,7 @@ start_req: } read_unlock_bh(&tp->accept_queue.syn_wait_lock); } - if (++st->bucket < TCP_LHTABLE_SIZE) { + if (++st->bucket < INET_LHTABLE_SIZE) { sk = sk_head(&tcp_listening_hash[st->bucket]); goto get_sk; } @@ -2506,7 +2512,7 @@ void __init tcp_v4_init(struct net_proto_family *ops) EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(tcp_bind_hash); -EXPORT_SYMBOL(tcp_bucket_create); +EXPORT_SYMBOL(inet_bind_bucket_create); EXPORT_SYMBOL(tcp_hashinfo); EXPORT_SYMBOL(tcp_inherit_port); EXPORT_SYMBOL(tcp_listen_wlock); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c46a55..1df6cd46 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -60,9 +60,9 @@ int tcp_tw_count; /* Must be called with locally disabled BHs. */ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) { - struct tcp_ehash_bucket *ehead; - struct tcp_bind_hashbucket *bhead; - struct tcp_bind_bucket *tb; + struct inet_ehash_bucket *ehead; + struct inet_bind_hashbucket *bhead; + struct inet_bind_bucket *tb; /* Unlink from established hashes. */ ehead = &tcp_ehash[tw->tw_hashent]; @@ -76,12 +76,12 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) write_unlock(&ehead->lock); /* Disassociate with bind bucket. */ - bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)]; + bhead = &tcp_bhash[inet_bhashfn(tw->tw_num, tcp_bhash_size)]; spin_lock(&bhead->lock); tb = tw->tw_tb; __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; - tcp_bucket_destroy(tb); + inet_bind_bucket_destroy(tcp_bucket_cachep, tb); spin_unlock(&bhead->lock); #ifdef SOCK_REFCNT_DEBUG @@ -296,14 +296,14 @@ kill: */ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) { - struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; - struct tcp_bind_hashbucket *bhead; + struct inet_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; + struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)]; + bhead = &tcp_bhash[inet_bhashfn(inet_sk(sk)->num, tcp_bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = tcp_sk(sk)->bind_hash; BUG_TRAP(tcp_sk(sk)->bind_hash); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4e32a849..31f50fb 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -98,11 +98,11 @@ static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) return tcp_v6_hashfn(laddr, lport, faddr, fport); } -static inline int tcp_v6_bind_conflict(struct sock *sk, - struct tcp_bind_bucket *tb) +static inline int tcp_v6_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) { - struct sock *sk2; - struct hlist_node *node; + const struct sock *sk2; + const struct hlist_node *node; /* We must walk the whole port owner list in this case. -DaveM */ sk_for_each_bound(sk2, node, &tb->owners) { @@ -126,8 +126,8 @@ static inline int tcp_v6_bind_conflict(struct sock *sk, */ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) { - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; struct hlist_node *node; int ret; @@ -146,9 +146,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) do { rover++; if (rover > high) rover = low; - head = &tcp_bhash[tcp_bhashfn(rover)]; + head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)]; spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == rover) goto next; break; @@ -171,9 +171,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) /* OK, here is the one we will use. */ snum = rover; } else { - head = &tcp_bhash[tcp_bhashfn(snum)]; + head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == snum) goto tb_found; } @@ -192,7 +192,7 @@ tb_found: } tb_not_found: ret = 1; - if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) + if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -224,7 +224,7 @@ static __inline__ void __tcp_v6_hash(struct sock *sk) BUG_TRAP(sk_unhashed(sk)); if (sk->sk_state == TCP_LISTEN) { - list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; + list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)]; lock = &tcp_lhash_lock; tcp_listen_wlock(); } else { @@ -264,7 +264,7 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor hiscore=0; read_lock(&tcp_lhash_lock); - sk_for_each(sk, node, &tcp_listening_hash[tcp_lhashfn(hnum)]) { + sk_for_each(sk, node, &tcp_listening_hash[inet_lhashfn(hnum)]) { if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -305,7 +305,7 @@ static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u struct in6_addr *daddr, u16 hnum, int dif) { - struct tcp_ehash_bucket *head; + struct inet_ehash_bucket *head; struct sock *sk; struct hlist_node *node; __u32 ports = TCP_COMBINED_PORTS(sport, hnum); @@ -461,7 +461,7 @@ static int __tcp_v6_check_established(struct sock *sk, __u16 lport, int dif = sk->sk_bound_dev_if; u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); - struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + struct inet_ehash_bucket *head = &tcp_ehash[hash]; struct sock *sk2; struct hlist_node *node; struct tcp_tw_bucket *tw; @@ -540,8 +540,8 @@ static inline u32 tcpv6_port_offset(const struct sock *sk) static int tcp_v6_hash_connect(struct sock *sk) { unsigned short snum = inet_sk(sk)->num; - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; int ret; if (!snum) { @@ -558,14 +558,14 @@ static int tcp_v6_hash_connect(struct sock *sk) local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &tcp_bhash[tcp_bhashfn(port)]; + head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, * because the established check is already * unique enough. */ - tb_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, node, &head->chain) { if (tb->port == port) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) @@ -578,7 +578,7 @@ static int tcp_v6_hash_connect(struct sock *sk) } } - tb = tcp_bucket_create(head, port); + tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port); if (!tb) { spin_unlock(&head->lock); break; @@ -613,7 +613,7 @@ ok: goto out; } - head = &tcp_bhash[tcp_bhashfn(snum)]; + head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; tb = tcp_sk(sk)->bind_hash; spin_lock_bh(&head->lock); -- cgit v0.10.2 From 77d8bf9c6208eb535f05718168ffcc476be0ca8c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:00:51 -0700 Subject: [INET]: Move the TCP hashtable functions/structs to inet_hashtables.[ch] Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 1fbd94d..f943306 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -8,6 +8,11 @@ extern struct proto_ops inet_dgram_ops; * INET4 prototypes used by INET6 */ +struct msghdr; +struct sock; +struct sockaddr; +struct socket; + extern void inet_remove_sock(struct sock *sk1); extern void inet_put_sock(unsigned short num, struct sock *sk); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index c4c9e39..3a6c11c 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -14,8 +14,107 @@ #ifndef _INET_HASHTABLES_H #define _INET_HASHTABLES_H +#include +#include +#include +#include #include +/* This is for all connections with a full identity, no wildcards. + * New scheme, half the table is for TIME_WAIT, the other half is + * for the rest. I'll experiment with dynamic table growth later. + */ +struct inet_ehash_bucket { + rwlock_t lock; + struct hlist_head chain; +} __attribute__((__aligned__(8))); + +/* There are a few simple rules, which allow for local port reuse by + * an application. In essence: + * + * 1) Sockets bound to different interfaces may share a local port. + * Failing that, goto test 2. + * 2) If all sockets have sk->sk_reuse set, and none of them are in + * TCP_LISTEN state, the port may be shared. + * Failing that, goto test 3. + * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local + * address, and none of them are the same, the port may be + * shared. + * Failing this, the port cannot be shared. + * + * The interesting point, is test #2. This is what an FTP server does + * all day. To optimize this case we use a specific flag bit defined + * below. As we add sockets to a bind bucket list, we perform a + * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) + * As long as all sockets added to a bind bucket pass this test, + * the flag bit will be set. + * The resulting situation is that tcp_v[46]_verify_bind() can just check + * for this flag bit, if it is set and the socket trying to bind has + * sk->sk_reuse set, we don't even have to walk the owners list at all, + * we return that it is ok to bind this socket to the requested local port. + * + * Sounds like a lot of work, but it is worth it. In a more naive + * implementation (ie. current FreeBSD etc.) the entire list of ports + * must be walked for each data port opened by an ftp server. Needless + * to say, this does not scale at all. With a couple thousand FTP + * users logged onto your box, isn't it nice to know that new data + * ports are created in O(1) time? I thought so. ;-) -DaveM + */ +struct inet_bind_bucket { + unsigned short port; + signed short fastreuse; + struct hlist_node node; + struct hlist_head owners; +}; + +#define inet_bind_bucket_for_each(tb, node, head) \ + hlist_for_each_entry(tb, node, head, node) + +struct inet_bind_hashbucket { + spinlock_t lock; + struct hlist_head chain; +}; + +/* This is for listening sockets, thus all sockets which possess wildcards. */ +#define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ + +struct inet_hashinfo { + /* This is for sockets with full identity only. Sockets here will + * always be without wildcards and will have the following invariant: + * + * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE + * + * First half of the table is for sockets not in TIME_WAIT, second half + * is for TIME_WAIT sockets only. + */ + struct inet_ehash_bucket *ehash; + + /* Ok, let's try this, I give up, we do need a local binding + * TCP hash as well as the others for fast bind/connect. + */ + struct inet_bind_hashbucket *bhash; + + int bhash_size; + int ehash_size; + + /* All sockets in TCP_LISTEN state will be in here. This is the only + * table where wildcard'd TCP sockets can exist. Hash function here + * is just local port number. + */ + struct hlist_head listening_hash[INET_LHTABLE_SIZE]; + + /* All the above members are written once at bootup and + * never written again _or_ are predominantly read-access. + * + * Now align to a new cache line as all the following members + * are often dirty. + */ + rwlock_t lhash_lock ____cacheline_aligned; + atomic_t lhash_users; + wait_queue_head_t lhash_wait; + spinlock_t portalloc_lock; +}; + static inline int inet_ehashfn(const __u32 laddr, const __u16 lport, const __u32 faddr, const __u16 fport, const int ehash_size) @@ -37,4 +136,27 @@ static inline int inet_sk_ehashfn(const struct sock *sk, const int ehash_size) return inet_ehashfn(laddr, lport, faddr, fport, ehash_size); } +extern struct inet_bind_bucket * + inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, + const unsigned short snum); +extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, + struct inet_bind_bucket *tb); + +static inline int inet_bhashfn(const __u16 lport, const int bhash_size) +{ + return lport & (bhash_size - 1); +} + +/* These can have wildcards, don't try too hard. */ +static inline int inet_lhashfn(const unsigned short num) +{ + return num & (INET_LHTABLE_SIZE - 1); +} + +static inline int inet_sk_listen_hashfn(const struct sock *sk) +{ + return inet_lhashfn(inet_sk(sk)->num); +} + #endif /* _INET_HASHTABLES_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 6c9f6f7..ff5d30a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -40,101 +41,6 @@ #endif #include -/* This is for all connections with a full identity, no wildcards. - * New scheme, half the table is for TIME_WAIT, the other half is - * for the rest. I'll experiment with dynamic table growth later. - */ -struct inet_ehash_bucket { - rwlock_t lock; - struct hlist_head chain; -} __attribute__((__aligned__(8))); - -/* This is for listening sockets, thus all sockets which possess wildcards. */ -#define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ - -/* There are a few simple rules, which allow for local port reuse by - * an application. In essence: - * - * 1) Sockets bound to different interfaces may share a local port. - * Failing that, goto test 2. - * 2) If all sockets have sk->sk_reuse set, and none of them are in - * TCP_LISTEN state, the port may be shared. - * Failing that, goto test 3. - * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local - * address, and none of them are the same, the port may be - * shared. - * Failing this, the port cannot be shared. - * - * The interesting point, is test #2. This is what an FTP server does - * all day. To optimize this case we use a specific flag bit defined - * below. As we add sockets to a bind bucket list, we perform a - * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) - * As long as all sockets added to a bind bucket pass this test, - * the flag bit will be set. - * The resulting situation is that tcp_v[46]_verify_bind() can just check - * for this flag bit, if it is set and the socket trying to bind has - * sk->sk_reuse set, we don't even have to walk the owners list at all, - * we return that it is ok to bind this socket to the requested local port. - * - * Sounds like a lot of work, but it is worth it. In a more naive - * implementation (ie. current FreeBSD etc.) the entire list of ports - * must be walked for each data port opened by an ftp server. Needless - * to say, this does not scale at all. With a couple thousand FTP - * users logged onto your box, isn't it nice to know that new data - * ports are created in O(1) time? I thought so. ;-) -DaveM - */ -struct inet_bind_bucket { - unsigned short port; - signed short fastreuse; - struct hlist_node node; - struct hlist_head owners; -}; - -#define inet_bind_bucket_for_each(tb, node, head) \ - hlist_for_each_entry(tb, node, head, node) - -struct inet_bind_hashbucket { - spinlock_t lock; - struct hlist_head chain; -}; - -struct inet_hashinfo { - /* This is for sockets with full identity only. Sockets here will - * always be without wildcards and will have the following invariant: - * - * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE - * - * First half of the table is for sockets not in TIME_WAIT, second half - * is for TIME_WAIT sockets only. - */ - struct inet_ehash_bucket *ehash; - - /* Ok, let's try this, I give up, we do need a local binding - * TCP hash as well as the others for fast bind/connect. - */ - struct inet_bind_hashbucket *bhash; - - int bhash_size; - int ehash_size; - - /* All sockets in TCP_LISTEN state will be in here. This is the only - * table where wildcard'd TCP sockets can exist. Hash function here - * is just local port number. - */ - struct hlist_head listening_hash[INET_LHTABLE_SIZE]; - - /* All the above members are written once at bootup and - * never written again _or_ are predominantly read-access. - * - * Now align to a new cache line as all the following members - * are often dirty. - */ - rwlock_t lhash_lock ____cacheline_aligned; - atomic_t lhash_users; - wait_queue_head_t lhash_wait; - spinlock_t portalloc_lock; -}; - extern struct inet_hashinfo tcp_hashinfo; #define tcp_ehash (tcp_hashinfo.ehash) #define tcp_bhash (tcp_hashinfo.bhash) @@ -147,19 +53,8 @@ extern struct inet_hashinfo tcp_hashinfo; #define tcp_portalloc_lock (tcp_hashinfo.portalloc_lock) extern kmem_cache_t *tcp_bucket_cachep; -extern struct inet_bind_bucket * - inet_bind_bucket_create(kmem_cache_t *cachep, - struct inet_bind_hashbucket *head, - const unsigned short snum); -extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, - struct inet_bind_bucket *tb); -extern int tcp_port_rover; -/* These are AF independent. */ -static inline int inet_bhashfn(const __u16 lport, const int bhash_size) -{ - return lport & (bhash_size - 1); -} +extern int tcp_port_rover; extern void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, unsigned short snum); @@ -359,17 +254,6 @@ extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -/* These can have wildcards, don't try too hard. */ -static inline int inet_lhashfn(const unsigned short num) -{ - return num & (INET_LHTABLE_SIZE - 1); -} - -static inline int inet_sk_listen_hashfn(const struct sock *sk) -{ - return inet_lhashfn(inet_sk(sk)->num); -} - #define MAX_TCP_HEADER (128 + MAX_HEADER) /* diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 61c7386..2d8d30e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -4,7 +4,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ - ip_output.o ip_sockglue.o \ + ip_output.o ip_sockglue.o inet_hashtables.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c new file mode 100644 index 0000000..343a890 --- /dev/null +++ b/net/ipv4/inet_hashtables.c @@ -0,0 +1,51 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic INET transport hashtables + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include + +/* + * Allocate and initialize a new local port bind bucket. + * The bindhash mutex for snum's hash chain must be held here. + */ +struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, + const unsigned short snum) +{ + struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); + + if (tb != NULL) { + tb->port = snum; + tb->fastreuse = 0; + INIT_HLIST_HEAD(&tb->owners); + hlist_add_head(&tb->node, &head->chain); + } + return tb; +} + +EXPORT_SYMBOL(inet_bind_bucket_create); + +/* + * Caller must hold hashbucket lock for this tb with local BH disabled + */ +void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) +{ + if (hlist_empty(&tb->owners)) { + __hlist_del(&tb->node); + kmem_cache_free(cachep, tb); + } +} diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4138630..58e36ed 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -104,32 +104,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { int sysctl_local_port_range[2] = { 1024, 4999 }; int tcp_port_rover = 1024 - 1; -/* Allocate and initialize a new local port bind bucket. - * The bindhash mutex for snum's hash chain must be held here. - */ -struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, - struct inet_bind_hashbucket *head, - const unsigned short snum) -{ - struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); - if (tb) { - tb->port = snum; - tb->fastreuse = 0; - INIT_HLIST_HEAD(&tb->owners); - hlist_add_head(&tb->node, &head->chain); - } - return tb; -} - -/* Caller must hold hashbucket lock for this tb with local BH disabled */ -void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) -{ - if (hlist_empty(&tb->owners)) { - __hlist_del(&tb->node); - kmem_cache_free(cachep, tb); - } -} - /* Caller must disable local BH processing. */ static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) { -- cgit v0.10.2 From a55ebcc4c4532107ad9eee1c9bb698ab5f12c00f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:01:14 -0700 Subject: [INET]: Move bind_hash from tcp_sk to inet_sk This should really be in a inet_connection_sock, but I'm leaving it for a later optimization, when some more fields common to INET transport protocols now in tcp_sk or inet_sk will be chunked out into inet_connection_sock, for now its better to concentrate on getting the changes in the core merged to leave the DCCP tree with only DCCP specific code. Next changesets will take advantage of this move to generalise things like tcp_bind_hash, tcp_put_port, tcp_inherit_port, making the later receive a inet_hashinfo parameter, and even __tcp_tw_hashdance, etc in the future, when tcp_tw_bucket gets transformed into the struct timewait_sock hierarchy. tcp_destroy_sock also is eligible as soon as tcp_orphan_count gets moved to sk_prot. A cascade of incremental changes will ultimately make the tcp_lookup functions be fully generic. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/ip.h b/include/linux/ip.h index 33e8a19..2c54bbd 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -128,6 +128,7 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) return (struct inet_request_sock *)sk; } +struct inet_bind_bucket; struct ipv6_pinfo; struct inet_sock { @@ -157,6 +158,7 @@ struct inet_sock { int mc_index; /* Multicast device index */ __u32 mc_addr; struct ip_mc_socklist *mc_list; /* Group array */ + struct inet_bind_bucket *bind_hash; /* * Following members are used to retain the infomation to build * an ip header on each ip fragmentation while the socket is corked. diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ec580a5..e70ab19 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -258,7 +258,6 @@ struct tcp_sock { __u32 snd_sml; /* Last byte of the most recently transmitted small packet */ __u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ __u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ - struct inet_bind_bucket *bind_hash; /* Delayed ACK control data */ struct { __u8 pending; /* ACK is pending */ diff --git a/include/net/tcp.h b/include/net/tcp.h index ff5d30a..6c6c879 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1266,7 +1266,7 @@ static __inline__ void tcp_set_state(struct sock *sk, int state) TCP_INC_STATS(TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); - if (tcp_sk(sk)->bind_hash && + if (inet_sk(sk)->bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) tcp_put_port(sk); /* fall through */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1ec03db..e54a410 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1575,7 +1575,7 @@ void tcp_destroy_sock(struct sock *sk) BUG_TRAP(sk_unhashed(sk)); /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash); + BUG_TRAP(!inet_sk(sk)->num || inet_sk(sk)->bind_hash); sk->sk_prot->destroy(sk); @@ -1802,7 +1802,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_sack_reset(&tp->rx_opt); __sk_dst_reset(sk); - BUG_TRAP(!inet->num || tp->bind_hash); + BUG_TRAP(!inet->num || inet->bind_hash); sk->sk_error_report(sk); return err; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 58e36ed..10a9b3a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -113,9 +113,9 @@ static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) struct inet_bind_bucket *tb; spin_lock(&head->lock); - tb = tcp_sk(sk)->bind_hash; + tb = inet_sk(sk)->bind_hash; sk_add_bind_node(child, &tb->owners); - tcp_sk(child)->bind_hash = tb; + inet_sk(child)->bind_hash = tb; spin_unlock(&head->lock); } @@ -129,9 +129,10 @@ inline void tcp_inherit_port(struct sock *sk, struct sock *child) void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { - inet_sk(sk)->num = snum; + struct inet_sock *inet = inet_sk(sk); + inet->num = snum; sk_add_bind_node(sk, &tb->owners); - tcp_sk(sk)->bind_hash = tb; + inet->bind_hash = tb; } static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) @@ -246,9 +247,9 @@ tb_not_found: (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) tb->fastreuse = 0; success: - if (!tcp_sk(sk)->bind_hash) + if (!inet_sk(sk)->bind_hash) tcp_bind_hash(sk, tb, snum); - BUG_TRAP(tcp_sk(sk)->bind_hash == tb); + BUG_TRAP(inet_sk(sk)->bind_hash == tb); ret = 0; fail_unlock: @@ -269,9 +270,9 @@ static void __tcp_put_port(struct sock *sk) struct inet_bind_bucket *tb; spin_lock(&head->lock); - tb = tcp_sk(sk)->bind_hash; + tb = inet->bind_hash; __sk_del_bind_node(sk); - tcp_sk(sk)->bind_hash = NULL; + inet->bind_hash = NULL; inet->num = 0; inet_bind_bucket_destroy(tcp_bucket_cachep, tb); spin_unlock(&head->lock); @@ -694,7 +695,7 @@ ok: } head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; - tb = tcp_sk(sk)->bind_hash; + tb = inet_sk(sk)->bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { __tcp_v4_hash(sk, 0); @@ -1940,7 +1941,7 @@ int tcp_v4_destroy_sock(struct sock *sk) __skb_queue_purge(&tp->ucopy.prequeue); /* Clean up a referenced TCP bind bucket. */ - if (tp->bind_hash) + if (inet_sk(sk)->bind_hash) tcp_put_port(sk); /* diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1df6cd46..267cea1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -296,17 +296,17 @@ kill: */ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) { + const struct inet_sock *inet = inet_sk(sk); struct inet_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; struct inet_bind_hashbucket *bhead; - /* Step 1: Put TW into bind hash. Original socket stays there too. - Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in + Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &tcp_bhash[inet_bhashfn(inet_sk(sk)->num, tcp_bhash_size)]; + bhead = &tcp_bhash[inet_bhashfn(inet->num, tcp_bhash_size)]; spin_lock(&bhead->lock); - tw->tw_tb = tcp_sk(sk)->bind_hash; - BUG_TRAP(tcp_sk(sk)->bind_hash); + tw->tw_tb = inet->bind_hash; + BUG_TRAP(inet->bind_hash); tw_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); @@ -694,6 +694,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if(newsk != NULL) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_sock *newinet = inet_sk(newsk); struct tcp_sock *newtp; struct sk_filter *filter; @@ -702,10 +703,10 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, /* SANITY */ sk_node_init(&newsk->sk_node); - tcp_sk(newsk)->bind_hash = NULL; + newinet->bind_hash = NULL; /* Clone the TCP header template */ - inet_sk(newsk)->dport = ireq->rmt_port; + newinet->dport = ireq->rmt_port; sock_lock_init(newsk); bh_lock_sock(newsk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 31f50fb..a8ca7ba 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -204,9 +204,9 @@ tb_not_found: tb->fastreuse = 0; success: - if (!tcp_sk(sk)->bind_hash) + if (!inet_sk(sk)->bind_hash) tcp_bind_hash(sk, tb, snum); - BUG_TRAP(tcp_sk(sk)->bind_hash == tb); + BUG_TRAP(inet_sk(sk)->bind_hash == tb); ret = 0; fail_unlock: @@ -613,8 +613,8 @@ ok: goto out; } - head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; - tb = tcp_sk(sk)->bind_hash; + head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; + tb = inet_sk(sk)->bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { -- cgit v0.10.2 From a86888b925299330053d20e0eba03ac4d2648c4b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 20:02:13 -0700 Subject: [NETFILTER]: Fix multiple problems with the conntrack event cache refcnt underflow: the reference count is decremented when a conntrack entry is removed from the hash but it is not incremented when entering new entries. missing protection of process context against softirq context: all cache operations need to locally disable softirqs to avoid races. Additionally the event cache can't be initialized when a packet enteres the conntrack code but needs to be initialized whenever we cache an event and the stored conntrack entry doesn't match the current one. incorrect flushing of the event cache in ip_ct_iterate_cleanup: without real locking we can't flush the cache for different CPUs without incurring races. The cache for different CPUs can only be flushed when no packets are going through the code. ip_ct_iterate_cleanup doesn't need to drop all references, so flushing is moved to the cleanup path. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index ff2c1c6..088742b 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -411,6 +411,7 @@ struct ip_conntrack_stat #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS #include +#include struct ip_conntrack_ecache { struct ip_conntrack *ct; @@ -445,26 +446,24 @@ ip_conntrack_expect_unregister_notifier(struct notifier_block *nb) return notifier_chain_unregister(&ip_conntrack_expect_chain, nb); } +extern void ip_ct_deliver_cached_events(const struct ip_conntrack *ct); +extern void __ip_ct_event_cache_init(struct ip_conntrack *ct); + static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, const struct sk_buff *skb) { - struct ip_conntrack_ecache *ecache = - &__get_cpu_var(ip_conntrack_ecache); - - if (unlikely((struct ip_conntrack *) skb->nfct != ecache->ct)) { - if (net_ratelimit()) { - printk(KERN_ERR "ctevent: skb->ct != ecache->ct !!!\n"); - dump_stack(); - } - } + struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct; + struct ip_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ct != ecache->ct) + __ip_ct_event_cache_init(ct); ecache->events |= event; + local_bh_enable(); } -extern void -ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct); -extern void ip_conntrack_event_cache_init(const struct sk_buff *skb); - static inline void ip_conntrack_event(enum ip_conntrack_events event, struct ip_conntrack *ct) { @@ -483,9 +482,7 @@ static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, const struct sk_buff *skb) {} static inline void ip_conntrack_event(enum ip_conntrack_events event, struct ip_conntrack *ct) {} -static inline void ip_conntrack_deliver_cached_events_for( - struct ip_conntrack *ct) {} -static inline void ip_conntrack_event_cache_init(const struct sk_buff *skb) {} +static inline void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) {} static inline void ip_conntrack_expect_event(enum ip_conntrack_expect_events event, struct ip_conntrack_expect *exp) {} diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h index fbf6c3e..dc4d2a0 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_core.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h @@ -44,18 +44,14 @@ static inline int ip_conntrack_confirm(struct sk_buff **pskb) struct ip_conntrack *ct = (struct ip_conntrack *)(*pskb)->nfct; int ret = NF_ACCEPT; - if (ct && !is_confirmed(ct)) - ret = __ip_conntrack_confirm(pskb); - ip_conntrack_deliver_cached_events_for(ct); - + if (ct) { + if (!is_confirmed(ct)) + ret = __ip_conntrack_confirm(pskb); + ip_ct_deliver_cached_events(ct); + } return ret; } -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS -struct ip_conntrack_ecache; -extern void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ec); -#endif - extern void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp); extern struct list_head *ip_conntrack_hash; diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index d9fddae..5c3f16e 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -85,73 +85,62 @@ struct notifier_block *ip_conntrack_expect_chain; DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); -static inline void __deliver_cached_events(struct ip_conntrack_ecache *ecache) +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +static inline void +__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) { + DEBUGP("ecache: delivering events for %p\n", ecache->ct); if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) notifier_call_chain(&ip_conntrack_chain, ecache->events, ecache->ct); ecache->events = 0; -} - -void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) -{ - __deliver_cached_events(ecache); + ip_conntrack_put(ecache->ct); + ecache->ct = NULL; } /* Deliver all cached events for a particular conntrack. This is called * by code prior to async packet handling or freeing the skb */ -void -ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct) +void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) { - struct ip_conntrack_ecache *ecache = - &__get_cpu_var(ip_conntrack_ecache); - - if (!ct) - return; + struct ip_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ecache->ct == ct) + __ip_ct_deliver_cached_events(ecache); + local_bh_enable(); +} - if (ecache->ct == ct) { - DEBUGP("ecache: delivering event for %p\n", ct); - __deliver_cached_events(ecache); - } else { - if (net_ratelimit()) - printk(KERN_WARNING "ecache: want to deliver for %p, " - "but cache has %p\n", ct, ecache->ct); - } +void __ip_ct_event_cache_init(struct ip_conntrack *ct) +{ + struct ip_conntrack_ecache *ecache; - /* signalize that events have already been delivered */ - ecache->ct = NULL; + /* take care of delivering potentially old events */ + ecache = &__get_cpu_var(ip_conntrack_ecache); + BUG_ON(ecache->ct == ct); + if (ecache->ct) + __ip_ct_deliver_cached_events(ecache); + /* initialize for this conntrack/packet */ + ecache->ct = ct; + nf_conntrack_get(&ct->ct_general); } -/* Deliver cached events for old pending events, if current conntrack != old */ -void ip_conntrack_event_cache_init(const struct sk_buff *skb) +/* flush the event cache - touches other CPU's data and must not be called while + * packets are still passing through the code */ +static void ip_ct_event_cache_flush(void) { - struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct; - struct ip_conntrack_ecache *ecache = - &__get_cpu_var(ip_conntrack_ecache); + struct ip_conntrack_ecache *ecache; + int cpu; - /* take care of delivering potentially old events */ - if (ecache->ct != ct) { - enum ip_conntrack_info ctinfo; - /* we have to check, since at startup the cache is NULL */ - if (likely(ecache->ct)) { - DEBUGP("ecache: entered for different conntrack: " - "ecache->ct=%p, skb->nfct=%p. delivering " - "events\n", ecache->ct, ct); - __deliver_cached_events(ecache); + for_each_cpu(cpu) { + ecache = &per_cpu(ip_conntrack_ecache, cpu); + if (ecache->ct) ip_conntrack_put(ecache->ct); - } else { - DEBUGP("ecache: entered for conntrack %p, " - "cache was clean before\n", ct); - } - - /* initialize for this conntrack/packet */ - ecache->ct = ip_conntrack_get(skb, &ctinfo); - /* ecache->events cleared by __deliver_cached_devents() */ - } else { - DEBUGP("ecache: re-entered for conntrack %p.\n", ct); } } - +#else +static inline void ip_ct_event_cache_flush(void) {} #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); @@ -878,8 +867,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum, IP_NF_ASSERT((*pskb)->nfct); - ip_conntrack_event_cache_init(*pskb); - ret = proto->packet(ct, *pskb, ctinfo); if (ret < 0) { /* Invalid: inverse of the return code tells @@ -1278,23 +1265,6 @@ ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) ip_conntrack_put(ct); } - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS - { - /* we need to deliver all cached events in order to drop - * the reference counts */ - int cpu; - for_each_cpu(cpu) { - struct ip_conntrack_ecache *ecache = - &per_cpu(ip_conntrack_ecache, cpu); - if (ecache->ct) { - __ip_ct_deliver_cached_events(ecache); - ip_conntrack_put(ecache->ct); - ecache->ct = NULL; - } - } - } -#endif } /* Fast function for those who don't want to parse /proc (and I don't @@ -1381,6 +1351,7 @@ void ip_conntrack_flush() delete... */ synchronize_net(); + ip_ct_event_cache_flush(); i_see_dead_people: ip_ct_iterate_cleanup(kill_all, NULL); if (atomic_read(&ip_conntrack_count) != 0) { diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index ca97c3a..ee5895a 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -401,7 +401,6 @@ static unsigned int ip_confirm(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - ip_conntrack_event_cache_init(*pskb); /* We've seen it coming out the other side: confirm it */ return ip_conntrack_confirm(pskb); } @@ -419,7 +418,6 @@ static unsigned int ip_conntrack_help(unsigned int hooknum, ct = ip_conntrack_get(*pskb, &ctinfo); if (ct && ct->helper) { unsigned int ret; - ip_conntrack_event_cache_init(*pskb); ret = ct->helper->help(pskb, ct, ctinfo); if (ret != NF_ACCEPT) return ret; @@ -978,6 +976,7 @@ EXPORT_SYMBOL_GPL(ip_conntrack_chain); EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain); EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier); EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier); +EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); #endif EXPORT_SYMBOL(ip_conntrack_protocol_register); -- cgit v0.10.2 From 94cd2b67641e7ddc2e6ed71d76e00116957423db Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Tue, 9 Aug 2005 20:02:36 -0700 Subject: [NETFILTER]: remove bogus memset() calls from ip_conntrack_netlink.c nfattr_parse_nested() calls nfattr_parse() which in turn does a memset on the 'tb' array. All callers therefore don't need to memset before calling it. Signed-off-by: Pablo Neira Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index f43ec18..36a046f 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -479,7 +479,6 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_IP_MAX * sizeof(tb)); if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) goto nfattr_failure; @@ -522,8 +521,6 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_PROTO_MAX * sizeof(tb)); - if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) goto nfattr_failure; @@ -556,7 +553,6 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_TUPLE_MAX * sizeof(tb)); memset(tuple, 0, sizeof(*tuple)); if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) @@ -607,8 +603,6 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_PROTONAT_MAX * sizeof(tb)); - if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) goto nfattr_failure; @@ -646,7 +640,6 @@ ctnetlink_parse_nat(struct nfattr *cda[], DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_NAT_MAX * sizeof(tb)); memset(range, 0, sizeof(*range)); if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) @@ -684,7 +677,6 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) struct nfattr *tb[CTA_HELP_MAX]; DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_HELP_MAX * sizeof(tb)); if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) goto nfattr_failure; -- cgit v0.10.2 From 88aa0429048d08c18f2772782588f953bbbd79be Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Tue, 9 Aug 2005 20:02:55 -0700 Subject: [NETFILTER]: conntrack_netlink: Fix locking during conntrack_create The current codepath allowed for ip_conntrack_lock to be unlock'ed twice. Signed-off-by: Pablo Neira Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 36a046f..0ab2d7d 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1052,13 +1052,14 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); + return err; + } + /* implicit 'else' */ + + /* we only allow nat config for new conntracks */ + if (cda[CTA_NAT-1]) { + err = -EINVAL; goto out_unlock; - } else { - /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT-1]) { - err = -EINVAL; - goto out_unlock; - } } /* We manipulate the conntrack inside the global conntrack table lock, -- cgit v0.10.2 From bd9a26b7f2ee7567571bb5b7acc1a256c544a0dd Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:03:22 -0700 Subject: [NETFILTER]: fix ctnetlink 'create_expect' parsing There was a stupid copy+paste mistake where we parse the MASK nfattr into the "tuple" variable instead of the "mask" variable. This patch fixes it. Thanks to Pablo Neira. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 0ab2d7d..23f18f6 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1388,7 +1388,7 @@ ctnetlink_create_expect(struct nfattr *cda[]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASK); + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK); if (err < 0) return err; -- cgit v0.10.2 From 927ccbcc28dceee29dad876982768cca29738564 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:03:40 -0700 Subject: [NETFILTER]: attribute count is an attribute of message type, not subsytem Prior to this patch, every nfnetlink subsystem had to specify it's attribute count. However, in reality the attribute count depends on the message type within the subsystem, not the subsystem itself. This patch moves 'attr_count' from 'struct nfnetlink_subsys' into nfnl_callback to fix this. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 561f9df..b0feb23 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -85,9 +85,10 @@ struct nfgenmsg { struct nfnl_callback { - kernel_cap_t cap_required; /* capabilities required for this msg */ int (*call)(struct sock *nl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp); + kernel_cap_t cap_required; /* capabilities required for this msg */ + u_int16_t attr_count; /* number of nfattr's */ }; struct nfnetlink_subsystem @@ -95,7 +96,6 @@ struct nfnetlink_subsystem const char *name; __u8 subsys_id; /* nfnetlink subsystem ID */ __u8 cb_count; /* number of callbacks */ - u_int32_t attr_count; /* number of nfattr's */ struct nfnl_callback *cb; /* callback for individual types */ }; diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 23f18f6..53d9897 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1484,21 +1484,28 @@ static struct notifier_block ctnl_notifier_exp = { static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, + .attr_count = CTA_MAX, .cap_required = CAP_NET_ADMIN }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, .cap_required = CAP_NET_ADMIN }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, + .attr_count = CTA_MAX, .cap_required = CAP_NET_ADMIN }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, .cap_required = CAP_NET_ADMIN }, }; static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, + .attr_count = CTA_EXPECT_MAX, .cap_required = CAP_NET_ADMIN }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, + .attr_count = CTA_EXPECT_MAX, .cap_required = CAP_NET_ADMIN }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, + .attr_count = CTA_EXPECT_MAX, .cap_required = CAP_NET_ADMIN }, }; @@ -1506,7 +1513,6 @@ static struct nfnetlink_subsystem ctnl_subsys = { .name = "conntrack", .subsys_id = NFNL_SUBSYS_CTNETLINK, .cb_count = IPCTNL_MSG_MAX, - .attr_count = CTA_MAX, .cb = ctnl_cb, }; @@ -1514,7 +1520,6 @@ static struct nfnetlink_subsystem ctnl_exp_subsys = { .name = "conntrack_expect", .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, .cb_count = IPCTNL_MSG_EXP_MAX, - .attr_count = CTA_MAX, .cb = ctnl_exp_cb, }; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 30b25f4..578e4fe 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -155,8 +155,18 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, struct nlmsghdr *nlh, struct nfattr *cda[]) { int min_len; + u_int16_t attr_count; + u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); - memset(cda, 0, sizeof(struct nfattr *) * subsys->attr_count); + if (unlikely(cb_id >= subsys->cb_count)) { + DEBUGP("msgtype %u >= %u, returning\n", + cb_id, subsys->cb_count); + return -EINVAL; + } + + attr_count = subsys->cb[cb_id].attr_count; + + memset(cda, 0, sizeof(struct nfattr *) * attr_count); /* check attribute lengths. */ min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg)); @@ -170,7 +180,7 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, while (NFA_OK(attr, attrlen)) { unsigned flavor = attr->nfa_type; if (flavor) { - if (flavor > subsys->attr_count) + if (flavor > attr_count) return -EINVAL; cda[flavor - 1] = attr; } @@ -256,9 +266,11 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, } { - struct nfattr *cda[ss->attr_count]; + u_int16_t attr_count = + ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count; + struct nfattr *cda[attr_count]; - memset(cda, 0, ss->attr_count*sizeof(struct nfattr *)); + memset(cda, 0, sizeof(struct nfattr *) * attr_count); err = nfnetlink_check_attributes(ss, nlh, cda); if (err < 0) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index f41045e..1750f0d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -805,8 +805,10 @@ out_put: static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFULA_MAX, + .cap_required = CAP_NET_ADMIN, }, [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, + .attr_count = NFULA_CFG_MAX, .cap_required = CAP_NET_ADMIN }, }; @@ -814,7 +816,6 @@ static struct nfnetlink_subsystem nfulnl_subsys = { .name = "log", .subsys_id = NFNL_SUBSYS_ULOG, .cb_count = NFULNL_MSG_MAX, - .attr_count = NFULA_MAX, .cb = nfulnl_cb, }; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index d7b0330..04323ee 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -877,10 +877,13 @@ out_put: static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, + .attr_count = NFQA_MAX, .cap_required = CAP_NET_ADMIN }, [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, + .attr_count = NFQA_MAX, .cap_required = CAP_NET_ADMIN }, [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, + .attr_count = NFQA_CFG_MAX, .cap_required = CAP_NET_ADMIN }, }; @@ -888,7 +891,6 @@ static struct nfnetlink_subsystem nfqnl_subsys = { .name = "nf_queue", .subsys_id = NFNL_SUBSYS_QUEUE, .cb_count = NFQNL_MSG_MAX, - .attr_count = NFQA_MAX, .cb = nfqnl_cb, }; -- cgit v0.10.2 From a42827b71b87fc9816d2f58626e825b0eb500efe Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:03:54 -0700 Subject: [NETFILTER]: cleanup nfnetlink_check_attributes() 1) memset return parameter 'cda' (nfattr pointer array) only on success 2) a message without attributes and just a 'struct nfgenmsg' is valid, don't return -EINVAL 3) use likely() and unlikely() where apropriate Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 578e4fe..84efffd 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -163,17 +163,16 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, cb_id, subsys->cb_count); return -EINVAL; } - - attr_count = subsys->cb[cb_id].attr_count; - - memset(cda, 0, sizeof(struct nfattr *) * attr_count); - /* check attribute lengths. */ min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg)); - if (nlh->nlmsg_len < min_len) + if (unlikely(nlh->nlmsg_len < min_len)) return -EINVAL; - if (nlh->nlmsg_len > min_len) { + attr_count = subsys->cb[cb_id].attr_count; + memset(cda, 0, sizeof(struct nfattr *) * attr_count); + + /* check attribute lengths. */ + if (likely(nlh->nlmsg_len > min_len)) { struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh)); int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); @@ -186,8 +185,10 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, } attr = NFA_NEXT(attr, attrlen); } - } else - return -EINVAL; + } + + /* implicit: if nlmsg_len == min_len, we return 0, and an empty + * (zeroed) cda[] array. The message is valid, but empty. */ return 0; } -- cgit v0.10.2 From 1444fc559b01aa5d4fedf4ee4f306a9e9cd56f95 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:04:07 -0700 Subject: [NETFILTER]: don't use nested attributes for conntrack_expect We used to use nested nfattr structures for ip_conntrack_expect. This is bogus, since ip_conntrack and ip_conntrack_expect are communicated in different netlink message types. both should be encoded at the top level attributes, no extra nesting required. This patch addresses the issue. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index fb528e0..5c55751 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -33,7 +33,6 @@ enum ctattr_type { CTA_COUNTERS_ORIG, CTA_COUNTERS_REPLY, CTA_USE, - CTA_EXPECT, CTA_ID, __CTA_MAX }; @@ -103,10 +102,12 @@ enum ctattr_protonat { enum ctattr_expect { CTA_EXPECT_UNSPEC, + CTA_EXPECT_MASTER, CTA_EXPECT_TUPLE, CTA_EXPECT_MASK, CTA_EXPECT_TIMEOUT, CTA_EXPECT_ID, + CTA_EXPECT_HELP_NAME, __CTA_EXPECT_MAX }; #define CTA_EXPECT_MAX (__CTA_EXPECT_MAX - 1) diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 53d9897..f5bda82 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1100,18 +1100,21 @@ static inline int ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct ip_conntrack_expect *exp) { + struct ip_conntrack *master = exp->master; u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ); u_int32_t id = htonl(exp->id); - struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT); if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) goto nfattr_failure; if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0) goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, + &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + CTA_EXPECT_MASTER) < 0) + goto nfattr_failure; NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); - NFA_NEST_END(skb, nest_parms); return 0; @@ -1259,10 +1262,8 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, return 0; } - if (cda[CTA_TUPLE_ORIG-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); - else if (cda[CTA_TUPLE_REPLY-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + if (cda[CTA_EXPECT_MASTER-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER); else return -EINVAL; @@ -1310,13 +1311,33 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct ip_conntrack_helper *h; int err; - /* delete by tuple needs either orig or reply tuple */ - if (cda[CTA_TUPLE_ORIG-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); - else if (cda[CTA_TUPLE_REPLY-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); - else if (cda[CTA_HELP_NAME-1]) { - char *name = NFA_DATA(cda[CTA_HELP_NAME-1]); + if (cda[CTA_EXPECT_TUPLE-1]) { + /* delete a single expect by tuple */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + + /* bump usage count to 2 */ + exp = ip_conntrack_expect_find_get(&tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = + *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + ip_conntrack_expect_put(exp); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ + ip_conntrack_unexpect_related(exp); + /* have to put what we 'get' above. + * after this line usage count == 0 */ + ip_conntrack_expect_put(exp); + } else if (cda[CTA_EXPECT_HELP_NAME-1]) { + char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); /* delete all expectations for this helper */ write_lock_bh(&ip_conntrack_lock); @@ -1332,7 +1353,6 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, __ip_ct_expect_unlink_destroy(exp); } write_unlock(&ip_conntrack_lock); - return 0; } else { /* This basically means we have to flush everything*/ write_lock_bh(&ip_conntrack_lock); @@ -1342,30 +1362,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, __ip_ct_expect_unlink_destroy(exp); } write_unlock_bh(&ip_conntrack_lock); - return 0; } - if (err < 0) - return err; - - /* bump usage count to 2 */ - exp = ip_conntrack_expect_find_get(&tuple); - if (!exp) - return -ENOENT; - - if (cda[CTA_EXPECT_ID-1]) { - u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); - if (exp->id != ntohl(id)) { - ip_conntrack_expect_put(exp); - return -ENOENT; - } - } - - /* after list removal, usage count == 1 */ - ip_conntrack_unexpect_related(exp); - /* have to put what we 'get' above. after this line usage count == 0 */ - ip_conntrack_expect_put(exp); - return 0; } static int @@ -1385,21 +1383,14 @@ ctnetlink_create_expect(struct nfattr *cda[]) DEBUGP("entered %s\n", __FUNCTION__); + /* caller guarantees that those three CTA_EXPECT_* exist */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); if (err < 0) return err; err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK); if (err < 0) return err; - - if (cda[CTA_TUPLE_ORIG-1]) - err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_TUPLE_ORIG); - else if (cda[CTA_TUPLE_REPLY-1]) - err = ctnetlink_parse_tuple(cda, &master_tuple, - CTA_TUPLE_REPLY); - else - return -EINVAL; - + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER); if (err < 0) return err; @@ -1444,7 +1435,9 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); - if (!cda[CTA_EXPECT_TUPLE-1] || !cda[CTA_EXPECT_MASK-1]) + if (!cda[CTA_EXPECT_TUPLE-1] + || !cda[CTA_EXPECT_MASK-1] + || !cda[CTA_EXPECT_MASTER-1]) return -EINVAL; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); -- cgit v0.10.2 From 14a50bbaa51202b676a95e9b41bc5ed6c77aa9cc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Aug 2005 20:05:52 -0700 Subject: [NETFILTER]: ctnetlink: make sure event order is correct The following sequence is displayed during events dumping of an ICMP connection: [NEW] [DESTROY] [UPDATE] This happens because the event IPCT_DESTROY is delivered in death_by_timeout(), that is called from the icmp protocol helper (ct->timeout.function) once we see the reply. To fix this, we move this event to destroy_conntrack(). Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 5c3f16e..dace93e 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -316,6 +316,7 @@ destroy_conntrack(struct nf_conntrack *nfct) IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); + ip_conntrack_event(IPCT_DESTROY, ct); set_bit(IPS_DYING_BIT, &ct->status); /* To make sure we don't get any weird locking issues here: @@ -355,7 +356,6 @@ static void death_by_timeout(unsigned long ul_conntrack) { struct ip_conntrack *ct = (void *)ul_conntrack; - ip_conntrack_event(IPCT_DESTROY, ct); write_lock_bh(&ip_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. */ -- cgit v0.10.2 From 37012f7fd326eb3c959428a4fe7e203e6304fe43 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Aug 2005 20:06:11 -0700 Subject: [NETFILTER]: fix conntrack refcount leak in unlink_expect() In unlink_expect(), the expectation is removed from the list so the refcount must be dropped as well. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index dace93e..9261388 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -204,6 +204,7 @@ static void unlink_expect(struct ip_conntrack_expect *exp) list_del(&exp->list); CONNTRACK_STAT_INC(expect_delete); exp->master->expecting--; + ip_conntrack_expect_put(exp); } void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp) -- cgit v0.10.2 From 28b19d99ac6d92e4fb2fe34144c495019a638a64 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Aug 2005 20:06:27 -0700 Subject: [NETFILTER]: Fix typo in ctnl_exp_cb array (no bug, just memory waste) This fixes the size of the ctnl_exp_cb array that is IPCTNL_MSG_EXP_MAX instead of IPCTNL_MSG_MAX. Simple typo. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index f5bda82..e3ba449 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1490,7 +1490,7 @@ static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { .cap_required = CAP_NET_ADMIN }, }; -static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_MAX] = { +static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, .attr_count = CTA_EXPECT_MAX, .cap_required = CAP_NET_ADMIN }, -- cgit v0.10.2 From ff21d5774b4a186c98be6398eacde75d896db804 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Aug 2005 20:06:42 -0700 Subject: [NETFILTER]: fix list traversal order in ctnetlink Currently conntracks are inserted after the head. That means that conntracks are sorted from the biggest to the smallest id. This happens because we use list_prepend (list_add) instead list_add_tail. This can result in problems during the list iteration. list_for_each(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = tuplehash_to_ctrack(h); if (ct->id <= *id) continue; In that case just the first conntrack in the bucket will be dumped. To fix this, we iterate the list from the tail to the head via list_for_each_prev. Same thing for the list of expectations. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index e3ba449..1221a9c 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -404,7 +404,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) read_lock_bh(&ip_conntrack_lock); for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { - list_for_each(i, &ip_conntrack_hash[cb->args[0]]) { + list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -441,7 +441,7 @@ ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb) write_lock_bh(&ip_conntrack_lock); for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { - list_for_each(i, &ip_conntrack_hash[cb->args[0]]) { + list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -1214,7 +1214,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); read_lock_bh(&ip_conntrack_lock); - list_for_each(i, &ip_conntrack_expect_list) { + list_for_each_prev(i, &ip_conntrack_expect_list) { exp = (struct ip_conntrack_expect *) i; if (exp->id <= *id) continue; -- cgit v0.10.2 From 2d8c4ce51903636ce0f60addc8134aa50ab8fa76 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:07:13 -0700 Subject: [INET]: Generalise tcp_bind_hash & tcp_inherit_port This required moving tcp_bucket_cachep to inet_hashinfo. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3a6c11c..da97055 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -14,12 +14,15 @@ #ifndef _INET_HASHTABLES_H #define _INET_HASHTABLES_H +#include #include #include #include #include #include +#include + /* This is for all connections with a full identity, no wildcards. * New scheme, half the table is for TIME_WAIT, the other half is * for the rest. I'll experiment with dynamic table growth later. @@ -113,6 +116,7 @@ struct inet_hashinfo { atomic_t lhash_users; wait_queue_head_t lhash_wait; spinlock_t portalloc_lock; + kmem_cache_t *bind_bucket_cachep; }; static inline int inet_ehashfn(const __u32 laddr, const __u16 lport, @@ -148,6 +152,9 @@ static inline int inet_bhashfn(const __u16 lport, const int bhash_size) return lport & (bhash_size - 1); } +extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum); + /* These can have wildcards, don't try too hard. */ static inline int inet_lhashfn(const unsigned short num) { @@ -159,4 +166,29 @@ static inline int inet_sk_listen_hashfn(const struct sock *sk) return inet_lhashfn(inet_sk(sk)->num); } +/* Caller must disable local BH processing. */ +static inline void __inet_inherit_port(struct inet_hashinfo *table, + struct sock *sk, struct sock *child) +{ + const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); + struct inet_bind_hashbucket *head = &table->bhash[bhash]; + struct inet_bind_bucket *tb; + + spin_lock(&head->lock); + tb = inet_sk(sk)->bind_hash; + sk_add_bind_node(child, &tb->owners); + inet_sk(child)->bind_hash = tb; + spin_unlock(&head->lock); +} + +static inline void inet_inherit_port(struct inet_hashinfo *table, + struct sock *sk, struct sock *child) +{ + local_bh_disable(); + __inet_inherit_port(table, sk, child); + local_bh_enable(); +} + +extern void inet_put_port(struct inet_hashinfo *table, struct sock *sk); + #endif /* _INET_HASHTABLES_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 6c6c879..9eb8ff7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -51,14 +51,10 @@ extern struct inet_hashinfo tcp_hashinfo; #define tcp_lhash_users (tcp_hashinfo.lhash_users) #define tcp_lhash_wait (tcp_hashinfo.lhash_wait) #define tcp_portalloc_lock (tcp_hashinfo.portalloc_lock) - -extern kmem_cache_t *tcp_bucket_cachep; +#define tcp_bucket_cachep (tcp_hashinfo.bind_bucket_cachep) extern int tcp_port_rover; -extern void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - unsigned short snum); - #if (BITS_PER_LONG == 64) #define TCP_ADDRCMP_ALIGN_BYTES 8 #else @@ -549,9 +545,6 @@ DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics); #define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val) #define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val) -extern void tcp_put_port(struct sock *sk); -extern void tcp_inherit_port(struct sock *sk, struct sock *child); - extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); @@ -1268,7 +1261,7 @@ static __inline__ void tcp_set_state(struct sock *sk, int state) sk->sk_prot->unhash(sk); if (inet_sk(sk)->bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) - tcp_put_port(sk); + inet_put_port(&tcp_hashinfo, sk); /* fall through */ default: if (oldstate==TCP_ESTABLISHED) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 343a890..33d6cbe 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -14,6 +14,7 @@ */ #include +#include #include #include @@ -49,3 +50,42 @@ void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) kmem_cache_free(cachep, tb); } } + +void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum) +{ + struct inet_sock *inet = inet_sk(sk); + inet->num = snum; + sk_add_bind_node(sk, &tb->owners); + inet->bind_hash = tb; +} + +EXPORT_SYMBOL(inet_bind_hash); + +/* + * Get rid of any references to a local port held by the given sock. + */ +static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + const int bhash = inet_bhashfn(inet->num, hashinfo->bhash_size); + struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; + struct inet_bind_bucket *tb; + + spin_lock(&head->lock); + tb = inet->bind_hash; + __sk_del_bind_node(sk); + inet->bind_hash = NULL; + inet->num = 0; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + spin_unlock(&head->lock); +} + +void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + local_bh_disable(); + __inet_put_port(hashinfo, sk); + local_bh_enable(); +} + +EXPORT_SYMBOL(inet_put_port); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e54a410..38c04c1a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,10 +271,6 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); -kmem_cache_t *tcp_bucket_cachep; - -EXPORT_SYMBOL_GPL(tcp_bucket_cachep); - kmem_cache_t *tcp_timewait_cachep; atomic_t tcp_orphan_count = ATOMIC_INIT(0); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 10a9b3a..40fe4f5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -104,37 +104,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { int sysctl_local_port_range[2] = { 1024, 4999 }; int tcp_port_rover = 1024 - 1; -/* Caller must disable local BH processing. */ -static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) -{ - struct inet_bind_hashbucket *head = - &tcp_bhash[inet_bhashfn(inet_sk(child)->num, - tcp_bhash_size)]; - struct inet_bind_bucket *tb; - - spin_lock(&head->lock); - tb = inet_sk(sk)->bind_hash; - sk_add_bind_node(child, &tb->owners); - inet_sk(child)->bind_hash = tb; - spin_unlock(&head->lock); -} - -inline void tcp_inherit_port(struct sock *sk, struct sock *child) -{ - local_bh_disable(); - __tcp_inherit_port(sk, child); - local_bh_enable(); -} - -void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - const unsigned short snum) -{ - struct inet_sock *inet = inet_sk(sk); - inet->num = snum; - sk_add_bind_node(sk, &tb->owners); - inet->bind_hash = tb; -} - static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); @@ -248,7 +217,7 @@ tb_not_found: tb->fastreuse = 0; success: if (!inet_sk(sk)->bind_hash) - tcp_bind_hash(sk, tb, snum); + inet_bind_hash(sk, tb, snum); BUG_TRAP(inet_sk(sk)->bind_hash == tb); ret = 0; @@ -259,32 +228,6 @@ fail: return ret; } -/* Get rid of any references to a local port held by the - * given sock. - */ -static void __tcp_put_port(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num, - tcp_bhash_size)]; - struct inet_bind_bucket *tb; - - spin_lock(&head->lock); - tb = inet->bind_hash; - __sk_del_bind_node(sk); - inet->bind_hash = NULL; - inet->num = 0; - inet_bind_bucket_destroy(tcp_bucket_cachep, tb); - spin_unlock(&head->lock); -} - -void tcp_put_port(struct sock *sk) -{ - local_bh_disable(); - __tcp_put_port(sk); - local_bh_enable(); -} - /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. * Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves @@ -678,7 +621,7 @@ ok: hint += i; /* Head lock still held and bh's disabled */ - tcp_bind_hash(sk, tb, port); + inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); __tcp_v4_hash(sk, 0); @@ -1537,7 +1480,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(newsk); __tcp_v4_hash(newsk, 0); - __tcp_inherit_port(sk, newsk); + __inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -1942,7 +1885,7 @@ int tcp_v4_destroy_sock(struct sock *sk) /* Clean up a referenced TCP bind bucket. */ if (inet_sk(sk)->bind_hash) - tcp_put_port(sk); + inet_put_port(&tcp_hashinfo, sk); /* * If sendmsg cached page exists, toss it. @@ -2486,14 +2429,11 @@ void __init tcp_v4_init(struct net_proto_family *ops) } EXPORT_SYMBOL(ipv4_specific); -EXPORT_SYMBOL(tcp_bind_hash); EXPORT_SYMBOL(inet_bind_bucket_create); EXPORT_SYMBOL(tcp_hashinfo); -EXPORT_SYMBOL(tcp_inherit_port); EXPORT_SYMBOL(tcp_listen_wlock); EXPORT_SYMBOL(tcp_port_rover); EXPORT_SYMBOL(tcp_prot); -EXPORT_SYMBOL(tcp_put_port); EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); EXPORT_SYMBOL(tcp_v4_connect); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a8ca7ba..bfbedb5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -205,7 +205,7 @@ tb_not_found: success: if (!inet_sk(sk)->bind_hash) - tcp_bind_hash(sk, tb, snum); + inet_bind_hash(sk, tb, snum); BUG_TRAP(inet_sk(sk)->bind_hash == tb); ret = 0; @@ -597,7 +597,7 @@ ok: hint += i; /* Head lock still held and bh's disabled */ - tcp_bind_hash(sk, tb, port); + inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); __tcp_v6_hash(sk); @@ -1536,7 +1536,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; __tcp_v6_hash(newsk); - tcp_inherit_port(sk, newsk); + inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; -- cgit v0.10.2 From 6e04e02165a7209a71db553b7bc48d68421e5ebf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:07:35 -0700 Subject: [INET]: Move tcp_port_rover to inet_hashinfo Also expose all of the tcp_hashinfo members, i.e. killing those tcp_ehash, etc macros, this will more clearly expose already generic functions and some that need just a bit of work to become generic, as we'll see in the upcoming changesets. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index da97055..da07411 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -117,6 +117,7 @@ struct inet_hashinfo { wait_queue_head_t lhash_wait; spinlock_t portalloc_lock; kmem_cache_t *bind_bucket_cachep; + int port_rover; }; static inline int inet_ehashfn(const __u32 laddr, const __u16 lport, diff --git a/include/net/sock.h b/include/net/sock.h index 69d869e..391d00b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -136,7 +136,7 @@ struct sock_common { * @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_lingertime: %SO_LINGER l_linger setting - * @sk_hashent: hash entry in several tables (e.g. tcp_ehash) + * @sk_hashent: hash entry in several tables (e.g. inet_hashinfo.ehash) * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used diff --git a/include/net/tcp.h b/include/net/tcp.h index 9eb8ff7..99e4769 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -41,19 +41,7 @@ #endif #include -extern struct inet_hashinfo tcp_hashinfo; -#define tcp_ehash (tcp_hashinfo.ehash) -#define tcp_bhash (tcp_hashinfo.bhash) -#define tcp_ehash_size (tcp_hashinfo.ehash_size) -#define tcp_bhash_size (tcp_hashinfo.bhash_size) -#define tcp_listening_hash (tcp_hashinfo.listening_hash) -#define tcp_lhash_lock (tcp_hashinfo.lhash_lock) -#define tcp_lhash_users (tcp_hashinfo.lhash_users) -#define tcp_lhash_wait (tcp_hashinfo.lhash_wait) -#define tcp_portalloc_lock (tcp_hashinfo.portalloc_lock) -#define tcp_bucket_cachep (tcp_hashinfo.bind_bucket_cachep) - -extern int tcp_port_rover; +extern struct inet_hashinfo tcp_hashinfo; #if (BITS_PER_LONG == 64) #define TCP_ADDRCMP_ALIGN_BYTES 8 @@ -1463,21 +1451,21 @@ extern void tcp_listen_wlock(void); /* - We may sleep inside this lock. * - If sleeping is not required (or called from BH), - * use plain read_(un)lock(&tcp_lhash_lock). + * use plain read_(un)lock(&inet_hashinfo.lhash_lock). */ static inline void tcp_listen_lock(void) { /* read_lock synchronizes to candidates to writers */ - read_lock(&tcp_lhash_lock); - atomic_inc(&tcp_lhash_users); - read_unlock(&tcp_lhash_lock); + read_lock(&tcp_hashinfo.lhash_lock); + atomic_inc(&tcp_hashinfo.lhash_users); + read_unlock(&tcp_hashinfo.lhash_lock); } static inline void tcp_listen_unlock(void) { - if (atomic_dec_and_test(&tcp_lhash_users)) - wake_up(&tcp_lhash_wait); + if (atomic_dec_and_test(&tcp_hashinfo.lhash_users)) + wake_up(&tcp_hashinfo.lhash_wait); } static inline int keepalive_intvl_when(const struct tcp_sock *tp) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 38c04c1a..2f4b1a3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2257,11 +2257,11 @@ void __init tcp_init(void) __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), sizeof(skb->cb)); - tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", - sizeof(struct inet_bind_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!tcp_bucket_cachep) + tcp_hashinfo.bind_bucket_cachep = + kmem_cache_create("tcp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!tcp_hashinfo.bind_bucket_cachep) panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", @@ -2276,7 +2276,7 @@ void __init tcp_init(void) * * The methodology is similar to that of the buffer cache. */ - tcp_ehash = + tcp_hashinfo.ehash = alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, @@ -2284,37 +2284,37 @@ void __init tcp_init(void) (25 - PAGE_SHIFT) : (27 - PAGE_SHIFT), HASH_HIGHMEM, - &tcp_ehash_size, + &tcp_hashinfo.ehash_size, NULL, 0); - tcp_ehash_size = (1 << tcp_ehash_size) >> 1; - for (i = 0; i < (tcp_ehash_size << 1); i++) { - rwlock_init(&tcp_ehash[i].lock); - INIT_HLIST_HEAD(&tcp_ehash[i].chain); + tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1; + for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) { + rwlock_init(&tcp_hashinfo.ehash[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); } - tcp_bhash = + tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), - tcp_ehash_size, + tcp_hashinfo.ehash_size, (num_physpages >= 128 * 1024) ? (25 - PAGE_SHIFT) : (27 - PAGE_SHIFT), HASH_HIGHMEM, - &tcp_bhash_size, + &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_bhash_size = 1 << tcp_bhash_size; - for (i = 0; i < tcp_bhash_size; i++) { - spin_lock_init(&tcp_bhash[i].lock); - INIT_HLIST_HEAD(&tcp_bhash[i].chain); + tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + for (i = 0; i < tcp_hashinfo.bhash_size; i++) { + spin_lock_init(&tcp_hashinfo.bhash[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); } /* Try to be a bit smarter and adjust defaults depending * on available memory. */ for (order = 0; ((1 << order) << PAGE_SHIFT) < - (tcp_bhash_size * sizeof(struct inet_bind_hashbucket)); + (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); order++) ; if (order >= 4) { @@ -2329,7 +2329,7 @@ void __init tcp_init(void) sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } - tcp_port_rover = sysctl_local_port_range[0] - 1; + tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; sysctl_tcp_mem[0] = 768 << order; sysctl_tcp_mem[1] = 1024 << order; @@ -2344,7 +2344,7 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", - tcp_ehash_size << 1, tcp_bhash_size); + tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size); tcp_register_congestion_control(&tcp_reno); } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 5bb6a0f..0ae738b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -595,7 +595,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_node *node; num = 0; - sk_for_each(sk, node, &tcp_listening_hash[i]) { + sk_for_each(sk, node, &tcp_hashinfo.listening_hash[i]) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) { @@ -645,8 +645,8 @@ skip_listen_ht: if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) return skb->len; - for (i = s_i; i < tcp_ehash_size; i++) { - struct inet_ehash_bucket *head = &tcp_ehash[i]; + for (i = s_i; i < tcp_hashinfo.ehash_size; i++) { + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[i]; struct sock *sk; struct hlist_node *node; @@ -678,7 +678,7 @@ next_normal: if (r->tcpdiag_states&TCPF_TIME_WAIT) { sk_for_each(sk, node, - &tcp_ehash[i + tcp_ehash_size].chain) { + &tcp_hashinfo.ehash[i + tcp_hashinfo.ehash_size].chain) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 40fe4f5..f5373f9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -94,6 +94,7 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_users = ATOMIC_INIT(0), .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), .portalloc_lock = SPIN_LOCK_UNLOCKED, + .port_rover = 1024 - 1, }; /* @@ -102,7 +103,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { * 32768-61000 */ int sysctl_local_port_range[2] = { 1024, 4999 }; -int tcp_port_rover = 1024 - 1; static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) { @@ -146,16 +146,16 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) int remaining = (high - low) + 1; int rover; - spin_lock(&tcp_portalloc_lock); - if (tcp_port_rover < low) + spin_lock(&tcp_hashinfo.portalloc_lock); + if (tcp_hashinfo.port_rover < low) rover = low; else - rover = tcp_port_rover; + rover = tcp_hashinfo.port_rover; do { rover++; if (rover > high) rover = low; - head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == rover) @@ -164,8 +164,8 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) next: spin_unlock(&head->lock); } while (--remaining > 0); - tcp_port_rover = rover; - spin_unlock(&tcp_portalloc_lock); + tcp_hashinfo.port_rover = rover; + spin_unlock(&tcp_hashinfo.portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash @@ -182,7 +182,7 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) */ snum = rover; } else { - head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == snum) @@ -205,7 +205,7 @@ tb_found: } tb_not_found: ret = 1; - if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL) + if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -237,22 +237,22 @@ fail: void tcp_listen_wlock(void) { - write_lock(&tcp_lhash_lock); + write_lock(&tcp_hashinfo.lhash_lock); - if (atomic_read(&tcp_lhash_users)) { + if (atomic_read(&tcp_hashinfo.lhash_users)) { DEFINE_WAIT(wait); for (;;) { - prepare_to_wait_exclusive(&tcp_lhash_wait, + prepare_to_wait_exclusive(&tcp_hashinfo.lhash_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&tcp_lhash_users)) + if (!atomic_read(&tcp_hashinfo.lhash_users)) break; - write_unlock_bh(&tcp_lhash_lock); + write_unlock_bh(&tcp_hashinfo.lhash_lock); schedule(); - write_lock_bh(&tcp_lhash_lock); + write_lock_bh(&tcp_hashinfo.lhash_lock); } - finish_wait(&tcp_lhash_wait, &wait); + finish_wait(&tcp_hashinfo.lhash_wait, &wait); } } @@ -263,20 +263,20 @@ static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) BUG_TRAP(sk_unhashed(sk)); if (listen_possible && sk->sk_state == TCP_LISTEN) { - list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &tcp_lhash_lock; + list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &tcp_hashinfo.lhash_lock; tcp_listen_wlock(); } else { - sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size); - list = &tcp_ehash[sk->sk_hashent].chain; - lock = &tcp_ehash[sk->sk_hashent].lock; + sk->sk_hashent = inet_sk_ehashfn(sk, tcp_hashinfo.ehash_size); + list = &tcp_hashinfo.ehash[sk->sk_hashent].chain; + lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock; write_lock(lock); } __sk_add_node(sk, list); sock_prot_inc_use(sk->sk_prot); write_unlock(lock); if (listen_possible && sk->sk_state == TCP_LISTEN) - wake_up(&tcp_lhash_wait); + wake_up(&tcp_hashinfo.lhash_wait); } static void tcp_v4_hash(struct sock *sk) @@ -298,9 +298,9 @@ void tcp_unhash(struct sock *sk) if (sk->sk_state == TCP_LISTEN) { local_bh_disable(); tcp_listen_wlock(); - lock = &tcp_lhash_lock; + lock = &tcp_hashinfo.lhash_lock; } else { - struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent]; lock = &head->lock; write_lock_bh(&head->lock); } @@ -311,7 +311,7 @@ void tcp_unhash(struct sock *sk) ende: if (sk->sk_state == TCP_LISTEN) - wake_up(&tcp_lhash_wait); + wake_up(&tcp_hashinfo.lhash_wait); } /* Don't inline this cruft. Here are some nice properties to @@ -366,8 +366,8 @@ static inline struct sock *tcp_v4_lookup_listener(const u32 daddr, struct sock *sk = NULL; struct hlist_head *head; - read_lock(&tcp_lhash_lock); - head = &tcp_listening_hash[inet_lhashfn(hnum)]; + read_lock(&tcp_hashinfo.lhash_lock); + head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)]; if (!hlist_empty(head)) { struct inet_sock *inet = inet_sk((sk = __sk_head(head))); @@ -382,7 +382,7 @@ static inline struct sock *tcp_v4_lookup_listener(const u32 daddr, sherry_cache: sock_hold(sk); } - read_unlock(&tcp_lhash_lock); + read_unlock(&tcp_hashinfo.lhash_lock); return sk; } @@ -406,8 +406,8 @@ static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size); - head = &tcp_ehash[hash]; + const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size); + head = &tcp_hashinfo.ehash[hash]; read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) @@ -415,7 +415,7 @@ static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, } /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { + sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; } @@ -469,8 +469,8 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, int dif = sk->sk_bound_dev_if; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); - const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size); - struct inet_ehash_bucket *head = &tcp_ehash[hash]; + const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size); + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; struct hlist_node *node; struct tcp_tw_bucket *tw; @@ -478,7 +478,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, write_lock(&head->lock); /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { + sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { tw = (struct tcp_tw_bucket *)sk2; if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { @@ -582,7 +582,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -602,7 +602,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) } } - tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port); + tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); if (!tb) { spin_unlock(&head->lock); break; @@ -637,7 +637,7 @@ ok: goto out; } - head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; tb = inet_sk(sk)->bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { @@ -1926,7 +1926,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) if (!sk) { st->bucket = 0; - sk = sk_head(&tcp_listening_hash[0]); + sk = sk_head(&tcp_hashinfo.listening_hash[0]); goto get_sk; } @@ -1980,7 +1980,7 @@ start_req: read_unlock_bh(&tp->accept_queue.syn_wait_lock); } if (++st->bucket < INET_LHTABLE_SIZE) { - sk = sk_head(&tcp_listening_hash[st->bucket]); + sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); goto get_sk; } cur = NULL; @@ -2004,7 +2004,7 @@ static void *established_get_first(struct seq_file *seq) struct tcp_iter_state* st = seq->private; void *rc = NULL; - for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) { + for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { struct sock *sk; struct hlist_node *node; struct tcp_tw_bucket *tw; @@ -2012,8 +2012,8 @@ static void *established_get_first(struct seq_file *seq) /* We can reschedule _before_ having picked the target: */ cond_resched_softirq(); - read_lock(&tcp_ehash[st->bucket].lock); - sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) { + read_lock(&tcp_hashinfo.ehash[st->bucket].lock); + sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family) { continue; } @@ -2022,14 +2022,14 @@ static void *established_get_first(struct seq_file *seq) } st->state = TCP_SEQ_STATE_TIME_WAIT; tw_for_each(tw, node, - &tcp_ehash[st->bucket + tcp_ehash_size].chain) { + &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { if (tw->tw_family != st->family) { continue; } rc = tw; goto out; } - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); st->state = TCP_SEQ_STATE_ESTABLISHED; } out: @@ -2056,15 +2056,15 @@ get_tw: cur = tw; goto out; } - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); st->state = TCP_SEQ_STATE_ESTABLISHED; /* We can reschedule between buckets: */ cond_resched_softirq(); - if (++st->bucket < tcp_ehash_size) { - read_lock(&tcp_ehash[st->bucket].lock); - sk = sk_head(&tcp_ehash[st->bucket].chain); + if (++st->bucket < tcp_hashinfo.ehash_size) { + read_lock(&tcp_hashinfo.ehash[st->bucket].lock); + sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); } else { cur = NULL; goto out; @@ -2078,7 +2078,7 @@ get_tw: } st->state = TCP_SEQ_STATE_TIME_WAIT; - tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain); + tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain); goto get_tw; found: cur = sk; @@ -2173,7 +2173,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: if (v) - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); local_bh_enable(); break; } @@ -2432,7 +2432,6 @@ EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(inet_bind_bucket_create); EXPORT_SYMBOL(tcp_hashinfo); EXPORT_SYMBOL(tcp_listen_wlock); -EXPORT_SYMBOL(tcp_port_rover); EXPORT_SYMBOL(tcp_prot); EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 267cea1..f29e2f6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -60,12 +60,11 @@ int tcp_tw_count; /* Must be called with locally disabled BHs. */ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) { - struct inet_ehash_bucket *ehead; struct inet_bind_hashbucket *bhead; struct inet_bind_bucket *tb; - /* Unlink from established hashes. */ - ehead = &tcp_ehash[tw->tw_hashent]; + struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[tw->tw_hashent]; + write_lock(&ehead->lock); if (hlist_unhashed(&tw->tw_node)) { write_unlock(&ehead->lock); @@ -76,12 +75,12 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) write_unlock(&ehead->lock); /* Disassociate with bind bucket. */ - bhead = &tcp_bhash[inet_bhashfn(tw->tw_num, tcp_bhash_size)]; + bhead = &tcp_hashinfo.bhash[inet_bhashfn(tw->tw_num, tcp_hashinfo.bhash_size)]; spin_lock(&bhead->lock); tb = tw->tw_tb; __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; - inet_bind_bucket_destroy(tcp_bucket_cachep, tb); + inet_bind_bucket_destroy(tcp_hashinfo.bind_bucket_cachep, tb); spin_unlock(&bhead->lock); #ifdef SOCK_REFCNT_DEBUG @@ -297,13 +296,13 @@ kill: static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) { const struct inet_sock *inet = inet_sk(sk); - struct inet_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; + struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[sk->sk_hashent]; struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &tcp_bhash[inet_bhashfn(inet->num, tcp_bhash_size)]; + bhead = &tcp_hashinfo.bhash[inet_bhashfn(inet->num, tcp_hashinfo.bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = inet->bind_hash; BUG_TRAP(inet->bind_hash); @@ -317,7 +316,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) sock_prot_dec_use(sk->sk_prot); /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ - tw_add_node(tw, &(ehead + tcp_ehash_size)->chain); + tw_add_node(tw, &(ehead + tcp_hashinfo.ehash_size)->chain); atomic_inc(&tw->tw_refcnt); write_unlock(&ehead->lock); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bfbedb5..362ef5a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -84,7 +84,7 @@ static __inline__ int tcp_v6_hashfn(struct in6_addr *laddr, u16 lport, hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); hashent ^= hashent>>16; hashent ^= hashent>>8; - return (hashent & (tcp_ehash_size - 1)); + return (hashent & (tcp_hashinfo.ehash_size - 1)); } static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) @@ -138,15 +138,15 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) int remaining = (high - low) + 1; int rover; - spin_lock(&tcp_portalloc_lock); - if (tcp_port_rover < low) + spin_lock(&tcp_hashinfo.portalloc_lock); + if (tcp_hashinfo.port_rover < low) rover = low; else - rover = tcp_port_rover; + rover = tcp_hashinfo.port_rover; do { rover++; if (rover > high) rover = low; - head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == rover) @@ -155,8 +155,8 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) next: spin_unlock(&head->lock); } while (--remaining > 0); - tcp_port_rover = rover; - spin_unlock(&tcp_portalloc_lock); + tcp_hashinfo.port_rover = rover; + spin_unlock(&tcp_hashinfo.portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash @@ -171,7 +171,7 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) /* OK, here is the one we will use. */ snum = rover; } else { - head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == snum) @@ -192,8 +192,11 @@ tb_found: } tb_not_found: ret = 1; - if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL) - goto fail_unlock; + if (tb == NULL) { + tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum); + if (tb == NULL) + goto fail_unlock; + } if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) tb->fastreuse = 1; @@ -224,13 +227,13 @@ static __inline__ void __tcp_v6_hash(struct sock *sk) BUG_TRAP(sk_unhashed(sk)); if (sk->sk_state == TCP_LISTEN) { - list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &tcp_lhash_lock; + list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &tcp_hashinfo.lhash_lock; tcp_listen_wlock(); } else { sk->sk_hashent = tcp_v6_sk_hashfn(sk); - list = &tcp_ehash[sk->sk_hashent].chain; - lock = &tcp_ehash[sk->sk_hashent].lock; + list = &tcp_hashinfo.ehash[sk->sk_hashent].chain; + lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock; write_lock(lock); } @@ -263,8 +266,8 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor int score, hiscore; hiscore=0; - read_lock(&tcp_lhash_lock); - sk_for_each(sk, node, &tcp_listening_hash[inet_lhashfn(hnum)]) { + read_lock(&tcp_hashinfo.lhash_lock); + sk_for_each(sk, node, &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)]) { if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -291,7 +294,7 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor } if (result) sock_hold(result); - read_unlock(&tcp_lhash_lock); + read_unlock(&tcp_hashinfo.lhash_lock); return result; } @@ -315,7 +318,7 @@ static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u * have wildcards anyways. */ hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); - head = &tcp_ehash[hash]; + head = &tcp_hashinfo.ehash[hash]; read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { /* For IPV6 do the cheaper port and family tests first. */ @@ -323,7 +326,7 @@ static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { + sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { /* FIXME: acme: check this... */ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; @@ -461,7 +464,7 @@ static int __tcp_v6_check_established(struct sock *sk, __u16 lport, int dif = sk->sk_bound_dev_if; u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); - struct inet_ehash_bucket *head = &tcp_ehash[hash]; + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; struct hlist_node *node; struct tcp_tw_bucket *tw; @@ -469,7 +472,7 @@ static int __tcp_v6_check_established(struct sock *sk, __u16 lport, write_lock(&head->lock); /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { + sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { tw = (struct tcp_tw_bucket*)sk2; if(*((__u32 *)&(tw->tw_dport)) == ports && @@ -558,7 +561,7 @@ static int tcp_v6_hash_connect(struct sock *sk) local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -578,7 +581,7 @@ static int tcp_v6_hash_connect(struct sock *sk) } } - tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port); + tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); if (!tb) { spin_unlock(&head->lock); break; @@ -613,7 +616,7 @@ ok: goto out; } - head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; tb = inet_sk(sk)->bind_hash; spin_lock_bh(&head->lock); -- cgit v0.10.2 From f3f05f7046e7c85b04af390d95a82a27160dd5d0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:08:09 -0700 Subject: [INET]: Generalise the tcp_listen_ lock routines Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index da07411..f5d6512 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -19,10 +19,14 @@ #include #include #include +#include /* only for TCP_LISTEN, damn :-( */ #include +#include #include +#include + /* This is for all connections with a full identity, no wildcards. * New scheme, half the table is for TIME_WAIT, the other half is * for the rest. I'll experiment with dynamic table growth later. @@ -192,4 +196,48 @@ static inline void inet_inherit_port(struct inet_hashinfo *table, extern void inet_put_port(struct inet_hashinfo *table, struct sock *sk); +extern void inet_listen_wlock(struct inet_hashinfo *hashinfo); + +/* + * - We may sleep inside this lock. + * - If sleeping is not required (or called from BH), + * use plain read_(un)lock(&inet_hashinfo.lhash_lock). + */ +static inline void inet_listen_lock(struct inet_hashinfo *hashinfo) +{ + /* read_lock synchronizes to candidates to writers */ + read_lock(&hashinfo->lhash_lock); + atomic_inc(&hashinfo->lhash_users); + read_unlock(&hashinfo->lhash_lock); +} + +static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo) +{ + if (atomic_dec_and_test(&hashinfo->lhash_users)) + wake_up(&hashinfo->lhash_wait); +} + +static inline void __inet_hash(struct inet_hashinfo *hashinfo, + struct sock *sk, const int listen_possible) +{ + struct hlist_head *list; + rwlock_t *lock; + + BUG_TRAP(sk_unhashed(sk)); + if (listen_possible && sk->sk_state == TCP_LISTEN) { + list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &hashinfo->lhash_lock; + inet_listen_wlock(hashinfo); + } else { + sk->sk_hashent = inet_sk_ehashfn(sk, hashinfo->ehash_size); + list = &hashinfo->ehash[sk->sk_hashent].chain; + lock = &hashinfo->ehash[sk->sk_hashent].lock; + write_lock(lock); + } + __sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock(lock); + if (listen_possible && sk->sk_state == TCP_LISTEN) + wake_up(&hashinfo->lhash_wait); +} #endif /* _INET_HASHTABLES_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 99e4769..bc110cc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1447,27 +1447,6 @@ static __inline__ void tcp_openreq_init(struct request_sock *req, extern void tcp_enter_memory_pressure(void); -extern void tcp_listen_wlock(void); - -/* - We may sleep inside this lock. - * - If sleeping is not required (or called from BH), - * use plain read_(un)lock(&inet_hashinfo.lhash_lock). - */ - -static inline void tcp_listen_lock(void) -{ - /* read_lock synchronizes to candidates to writers */ - read_lock(&tcp_hashinfo.lhash_lock); - atomic_inc(&tcp_hashinfo.lhash_users); - read_unlock(&tcp_hashinfo.lhash_lock); -} - -static inline void tcp_listen_unlock(void) -{ - if (atomic_dec_and_test(&tcp_hashinfo.lhash_users)) - wake_up(&tcp_hashinfo.lhash_wait); -} - static inline int keepalive_intvl_when(const struct tcp_sock *tp) { return tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 33d6cbe..06cbc6f 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -15,7 +15,9 @@ #include #include +#include #include +#include #include @@ -89,3 +91,33 @@ void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) } EXPORT_SYMBOL(inet_put_port); + +/* + * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines (wake up each + * exclusive lock release). It should be ifdefed really. + */ +void inet_listen_wlock(struct inet_hashinfo *hashinfo) +{ + write_lock(&hashinfo->lhash_lock); + + if (atomic_read(&hashinfo->lhash_users)) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&hashinfo->lhash_wait, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&hashinfo->lhash_users)) + break; + write_unlock_bh(&hashinfo->lhash_lock); + schedule(); + write_lock_bh(&hashinfo->lhash_lock); + } + + finish_wait(&hashinfo->lhash_wait, &wait); + } +} + +EXPORT_SYMBOL(inet_listen_wlock); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 0ae738b..1a89a03 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -589,7 +589,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) if (cb->args[0] == 0) { if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) goto skip_listen_ht; - tcp_listen_lock(); + inet_listen_lock(&tcp_hashinfo); for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct sock *sk; struct hlist_node *node; @@ -613,7 +613,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) goto syn_recv; if (tcpdiag_dump_sock(skb, sk, cb) < 0) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); goto done; } @@ -622,7 +622,7 @@ syn_recv: goto next_listen; if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); goto done; } @@ -636,7 +636,7 @@ next_listen: cb->args[3] = 0; cb->args[4] = 0; } - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); skip_listen_ht: cb->args[0] = 1; s_i = num = s_num = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f5373f9..5f9ad95 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -228,62 +228,11 @@ fail: return ret; } -/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. - * Look, when several writers sleep and reader wakes them up, all but one - * immediately hit write lock and grab all the cpus. Exclusive sleep solves - * this, _but_ remember, it adds useless work on UP machines (wake up each - * exclusive lock release). It should be ifdefed really. - */ - -void tcp_listen_wlock(void) -{ - write_lock(&tcp_hashinfo.lhash_lock); - - if (atomic_read(&tcp_hashinfo.lhash_users)) { - DEFINE_WAIT(wait); - - for (;;) { - prepare_to_wait_exclusive(&tcp_hashinfo.lhash_wait, - &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&tcp_hashinfo.lhash_users)) - break; - write_unlock_bh(&tcp_hashinfo.lhash_lock); - schedule(); - write_lock_bh(&tcp_hashinfo.lhash_lock); - } - - finish_wait(&tcp_hashinfo.lhash_wait, &wait); - } -} - -static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) -{ - struct hlist_head *list; - rwlock_t *lock; - - BUG_TRAP(sk_unhashed(sk)); - if (listen_possible && sk->sk_state == TCP_LISTEN) { - list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &tcp_hashinfo.lhash_lock; - tcp_listen_wlock(); - } else { - sk->sk_hashent = inet_sk_ehashfn(sk, tcp_hashinfo.ehash_size); - list = &tcp_hashinfo.ehash[sk->sk_hashent].chain; - lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock; - write_lock(lock); - } - __sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); - write_unlock(lock); - if (listen_possible && sk->sk_state == TCP_LISTEN) - wake_up(&tcp_hashinfo.lhash_wait); -} - static void tcp_v4_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __tcp_v4_hash(sk, 1); + __inet_hash(&tcp_hashinfo, sk, 1); local_bh_enable(); } } @@ -297,7 +246,7 @@ void tcp_unhash(struct sock *sk) if (sk->sk_state == TCP_LISTEN) { local_bh_disable(); - tcp_listen_wlock(); + inet_listen_wlock(&tcp_hashinfo); lock = &tcp_hashinfo.lhash_lock; } else { struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent]; @@ -624,7 +573,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); - __tcp_v4_hash(sk, 0); + __inet_hash(&tcp_hashinfo, sk, 0); } spin_unlock(&head->lock); @@ -641,7 +590,7 @@ ok: tb = inet_sk(sk)->bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __tcp_v4_hash(sk, 0); + __inet_hash(&tcp_hashinfo, sk, 0); spin_unlock_bh(&head->lock); return 0; } else { @@ -1479,7 +1428,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk); - __tcp_v4_hash(newsk, 0); + __inet_hash(&tcp_hashinfo, newsk, 0); __inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -2102,12 +2051,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) void *rc; struct tcp_iter_state* st = seq->private; - tcp_listen_lock(); + inet_listen_lock(&tcp_hashinfo); st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_idx(seq, &pos); if (!rc) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); local_bh_disable(); st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_idx(seq, pos); @@ -2140,7 +2089,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); local_bh_disable(); st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_first(seq); @@ -2168,7 +2117,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); break; case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: @@ -2431,7 +2380,6 @@ void __init tcp_v4_init(struct net_proto_family *ops) EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(inet_bind_bucket_create); EXPORT_SYMBOL(tcp_hashinfo); -EXPORT_SYMBOL(tcp_listen_wlock); EXPORT_SYMBOL(tcp_prot); EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 362ef5a..93a66b9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -229,7 +229,7 @@ static __inline__ void __tcp_v6_hash(struct sock *sk) if (sk->sk_state == TCP_LISTEN) { list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; lock = &tcp_hashinfo.lhash_lock; - tcp_listen_wlock(); + inet_listen_wlock(&tcp_hashinfo); } else { sk->sk_hashent = tcp_v6_sk_hashfn(sk); list = &tcp_hashinfo.ehash[sk->sk_hashent].chain; -- cgit v0.10.2 From c752f0739f09b803aed191c4765a3b6650a08653 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:08:28 -0700 Subject: [TCP]: Move the tcp sock states to net/tcp_states.h Lots of places just needs the states, not even linux/tcp.h, where this enum was, needs it. This speeds up development of the refactorings as less sources are rebuilt when things get moved from net/tcp.h. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c index 93f3cd2..6815b1b 100644 --- a/fs/smbfs/sock.c +++ b/fs/smbfs/sock.c @@ -15,12 +15,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e70ab19..b88fe05 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -55,24 +55,6 @@ struct tcphdr { __u16 urg_ptr; }; - -enum { - TCP_ESTABLISHED = 1, - TCP_SYN_SENT, - TCP_SYN_RECV, - TCP_FIN_WAIT1, - TCP_FIN_WAIT2, - TCP_TIME_WAIT, - TCP_CLOSE, - TCP_CLOSE_WAIT, - TCP_LAST_ACK, - TCP_LISTEN, - TCP_CLOSING, /* now a valid state */ - - TCP_MAX_STATES /* Leave at the end! */ -}; - -#define TCP_STATE_MASK 0xF #define TCP_ACTION_FIN (1 << 7) enum { diff --git a/include/net/dn.h b/include/net/dn.h index 5551c46..c1dbbd2 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -3,6 +3,7 @@ #include #include +#include #include typedef unsigned short dn_address; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index f5d6512..c816708 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -19,11 +19,11 @@ #include #include #include -#include /* only for TCP_LISTEN, damn :-( */ #include #include #include +#include #include diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f920706..1f2e428 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 52da5d2..7a3c437 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -255,7 +255,6 @@ struct ip_vs_daemon_user { #include /* for struct atomic_t */ #include /* for struct neighbour */ #include /* for struct dst_entry */ -#include #include #include diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 5999e56..c51541e 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -47,10 +47,10 @@ #ifndef __sctp_constants_h__ #define __sctp_constants_h__ -#include /* For TCP states used in sctp_sock_state_t */ #include #include /* For ipv6hdr. */ #include +#include /* For TCP states used in sctp_sock_state_t */ /* Value used for stream negotiation. */ enum { SCTP_MAX_STREAM = 0xffff }; diff --git a/include/net/tcp.h b/include/net/tcp.h index bc110cc..9d026d8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -36,6 +36,8 @@ #include #include #include +#include + #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) #include #endif diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h new file mode 100644 index 0000000..b9d4176 --- /dev/null +++ b/include/net/tcp_states.h @@ -0,0 +1,34 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the TCP protocol sk_state field. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _LINUX_TCP_STATES_H +#define _LINUX_TCP_STATES_H + +enum { + TCP_ESTABLISHED = 1, + TCP_SYN_SENT, + TCP_SYN_RECV, + TCP_FIN_WAIT1, + TCP_FIN_WAIT2, + TCP_TIME_WAIT, + TCP_CLOSE, + TCP_CLOSE_WAIT, + TCP_LAST_ACK, + TCP_LISTEN, + TCP_CLOSING, /* Now a valid state */ + + TCP_MAX_STATES /* Leave at the end! */ +}; + +#define TCP_STATE_MASK 0xF + +#endif /* _LINUX_TCP_STATES_H */ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index ffde33c..1d31b3a 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -53,12 +53,12 @@ #include #include -#include #include #include /* For TIOCOUTQ/INQ */ #include #include #include +#include #include #include diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index a5c94f1..ea43dfb 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c index 8adc002..5d0f8fb 100644 --- a/net/ax25/ax25_ds_in.c +++ b/net/ax25/ax25_ds_in.c @@ -23,7 +23,7 @@ #include #include #include /* For ip_rcv */ -#include +#include #include #include #include diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index 3a8b673..061083e 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 124eec8..0357705 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -27,7 +27,7 @@ #include #include #include /* For ip_rcv */ -#include +#include #include /* For arp_rcv */ #include #include diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c index 7131873..83a3338 100644 --- a/net/ax25/ax25_std_in.c +++ b/net/ax25/ax25_std_in.c @@ -30,7 +30,7 @@ #include #include #include /* For ip_rcv */ -#include +#include #include #include #include diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c index 066897b..a29c480 100644 --- a/net/ax25/ax25_std_timer.c +++ b/net/ax25/ax25_std_timer.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index eb7343c..c41dbe5 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/core/datagram.c b/net/core/datagram.c index fcee054..da9bf71 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -51,9 +50,10 @@ #include #include -#include -#include +#include +#include +#include /* * Is a socket 'connection oriented' ? diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index bd49dd9..621680f 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -118,7 +118,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat #include #include #include -#include +#include #include #include #include diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 202dbde..369f25b 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -60,7 +60,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index b1db561..3fd49f4 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index d9212ad..6e092da 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 0db405a..291831e 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e222c5c..304bb0a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -71,6 +70,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dc4d073..a8135e1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -95,7 +95,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 5229365..761984f 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a082646..766e1c7 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index eff050a..2ffe34c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 3a13c5d..39d5939 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include @@ -52,6 +51,7 @@ #include #include #include +#include #include diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c index b676191..1f73d9e 100644 --- a/net/ipx/ipx_proc.c +++ b/net/ipx/ipx_proc.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 92c6e8d..6f92f9c 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -56,7 +56,7 @@ #include #include -#include +#include #include diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index f49b82d..66f55e5 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -23,13 +23,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include /* remember: uninitialized global data is zeroed because its in .bss */ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 5715486..4c644bc 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index 0f9fc48..0f84f66 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -25,6 +24,7 @@ #include #include #include +#include u8 llc_mac_null_var[IFHWADDRLEN]; diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 965c94e..34228ef 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include /** diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 162a85f..9aa8b14 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 9c44b37..2fcba9e 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include /* For ip_rcv */ #include #include diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index 252c1b3..587bed2 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index faabda8..75b72d3 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 5480caf..c6e59f8 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index ef475a1..a52417b 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -27,7 +27,7 @@ #include #include #include /* For ip_rcv */ -#include +#include #include #include #include diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 25da6f6..4510cd7 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c index ae135e2..a29a3a9 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index 84dd440..50ae037 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 4a6421a..fa3be2b 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -66,8 +66,8 @@ #include #include -#include #include +#include #include #include #include diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d0c3120..e750cb6 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -34,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d403e34..bc4c445 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -105,7 +105,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 46252d2..6ffc64e 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -76,11 +76,11 @@ #include #include #include -#include #include #include #include +#include /* Internal data structures and random procedures: */ diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index d93b19f..596cb96 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 04bec04..020d73c 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #include #include /* For TIOCINQ/OUTQ */ diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index b0197c7..2614687 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index e20cfad..8be9b8f 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include /* diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index d6a21a3..0a92e1d 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include static void x25_heartbeat_expiry(unsigned long); -- cgit v0.10.2 From 81849d106b1fb97f8e2d311c0c4d36347def55b8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:08:50 -0700 Subject: [INET]: Generalise tcp_v4_hash & tcp_unhash It really just makes the existing code be a helper function that tcp_v4_hash and tcp_unhash uses, specifying the right inet_hashinfo, tcp_hashinfo. One thing I'll investigate at some point is to have the inet_hashinfo pointer in sk_prot, so that we get all the hashtable information from the sk pointer, this can lead to some extra indirections that may well hurt performance/code size, we'll see. Ultimate idea would be that sk_prot would provide _all_ the information about a protocol implementation. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index c816708..6731df2 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -240,4 +240,38 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo, if (listen_possible && sk->sk_state == TCP_LISTEN) wake_up(&hashinfo->lhash_wait); } + +static inline void inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + __inet_hash(hashinfo, sk, 1); + local_bh_enable(); + } +} + +static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + rwlock_t *lock; + + if (sk_unhashed(sk)) + goto out; + + if (sk->sk_state == TCP_LISTEN) { + local_bh_disable(); + inet_listen_wlock(hashinfo); + lock = &hashinfo->lhash_lock; + } else { + struct inet_ehash_bucket *head = &hashinfo->ehash[sk->sk_hashent]; + lock = &head->lock; + write_lock_bh(&head->lock); + } + + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + write_unlock_bh(lock); +out: + if (sk->sk_state == TCP_LISTEN) + wake_up(&hashinfo->lhash_wait); +} #endif /* _INET_HASHTABLES_H */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5f9ad95..dca1be6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -230,37 +230,12 @@ fail: static void tcp_v4_hash(struct sock *sk) { - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); - __inet_hash(&tcp_hashinfo, sk, 1); - local_bh_enable(); - } + inet_hash(&tcp_hashinfo, sk); } void tcp_unhash(struct sock *sk) { - rwlock_t *lock; - - if (sk_unhashed(sk)) - goto ende; - - if (sk->sk_state == TCP_LISTEN) { - local_bh_disable(); - inet_listen_wlock(&tcp_hashinfo); - lock = &tcp_hashinfo.lhash_lock; - } else { - struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent]; - lock = &head->lock; - write_lock_bh(&head->lock); - } - - if (__sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - write_unlock_bh(lock); - - ende: - if (sk->sk_state == TCP_LISTEN) - wake_up(&tcp_hashinfo.lhash_wait); + inet_unhash(&tcp_hashinfo, sk); } /* Don't inline this cruft. Here are some nice properties to -- cgit v0.10.2 From 33b62231908c58ae04185e4f1063d1e35a7c8576 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:09:06 -0700 Subject: [INET]: Generalise tcp_v4_lookup_listener [acme@toy net-2.6.14]$ grep built-in /tmp/before /tmp/after /tmp/before: 282560 13122 9312 304994 4a762 net/ipv4/built-in.o /tmp/after: 282560 13122 9312 304994 4a762 net/ipv4/built-in.o Will be used in DCCP, not exporting it right now not to get in Adrian Bunk's exported-but-not-used-on-modules radar 8) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 6731df2..1c4fa00 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -16,8 +16,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -274,4 +276,38 @@ out: if (sk->sk_state == TCP_LISTEN) wake_up(&hashinfo->lhash_wait); } + +extern struct sock *__inet_lookup_listener(const struct hlist_head *head, + const u32 daddr, + const unsigned short hnum, + const int dif); + +/* Optimize the common listener case. */ +static inline struct sock *inet_lookup_listener(struct inet_hashinfo *hashinfo, + const u32 daddr, + const unsigned short hnum, + const int dif) +{ + struct sock *sk = NULL; + struct hlist_head *head; + + read_lock(&hashinfo->lhash_lock); + head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; + if (!hlist_empty(head)) { + const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); + + if (inet->num == hnum && !sk->sk_node.next && + (!inet->rcv_saddr || inet->rcv_saddr == daddr) && + (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && + !sk->sk_bound_dev_if) + goto sherry_cache; + sk = __inet_lookup_listener(head, daddr, hnum, dif); + } + if (sk) { +sherry_cache: + sock_hold(sk); + } + read_unlock(&hashinfo->lhash_lock); + return sk; +} #endif /* _INET_HASHTABLES_H */ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 06cbc6f..88fcba0 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -121,3 +121,44 @@ void inet_listen_wlock(struct inet_hashinfo *hashinfo) } EXPORT_SYMBOL(inet_listen_wlock); + +/* + * Don't inline this cruft. Here are some nice properties to exploit here. The + * BSD API does not allow a listening sock to specify the remote port nor the + * remote address for the connection. So always assume those are both + * wildcarded during the search since they can never be otherwise. + */ +struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, + const unsigned short hnum, const int dif) +{ + struct sock *result = NULL, *sk; + const struct hlist_node *node; + int hiscore = -1; + + sk_for_each(sk, node, head) { + const struct inet_sock *inet = inet_sk(sk); + + if (inet->num == hnum && !ipv6_only_sock(sk)) { + const __u32 rcv_saddr = inet->rcv_saddr; + int score = sk->sk_family == PF_INET ? 1 : 0; + + if (rcv_saddr) { + if (rcv_saddr != daddr) + continue; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score += 2; + } + if (score == 5) + return sk; + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + return result; +} diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index dca1be6..a678709 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -238,78 +238,6 @@ void tcp_unhash(struct sock *sk) inet_unhash(&tcp_hashinfo, sk); } -/* Don't inline this cruft. Here are some nice properties to - * exploit here. The BSD API does not allow a listening TCP - * to specify the remote port nor the remote address for the - * connection. So always assume those are both wildcarded - * during the search since they can never be otherwise. - */ -static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, - const u32 daddr, - const unsigned short hnum, - const int dif) -{ - struct sock *result = NULL, *sk; - struct hlist_node *node; - int score, hiscore; - - hiscore=-1; - sk_for_each(sk, node, head) { - struct inet_sock *inet = inet_sk(sk); - - if (inet->num == hnum && !ipv6_only_sock(sk)) { - __u32 rcv_saddr = inet->rcv_saddr; - - score = (sk->sk_family == PF_INET ? 1 : 0); - if (rcv_saddr) { - if (rcv_saddr != daddr) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if (score == 5) - return sk; - if (score > hiscore) { - hiscore = score; - result = sk; - } - } - } - return result; -} - -/* Optimize the common listener case. */ -static inline struct sock *tcp_v4_lookup_listener(const u32 daddr, - const unsigned short hnum, - const int dif) -{ - struct sock *sk = NULL; - struct hlist_head *head; - - read_lock(&tcp_hashinfo.lhash_lock); - head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)]; - if (!hlist_empty(head)) { - struct inet_sock *inet = inet_sk((sk = __sk_head(head))); - - if (inet->num == hnum && !sk->sk_node.next && - (!inet->rcv_saddr || inet->rcv_saddr == daddr) && - (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && - !sk->sk_bound_dev_if) - goto sherry_cache; - sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif); - } - if (sk) { -sherry_cache: - sock_hold(sk); - } - read_unlock(&tcp_hashinfo.lhash_lock); - return sk; -} - /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * @@ -358,7 +286,7 @@ static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, struct sock *sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif); - return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif); + return sk ? : inet_lookup_listener(&tcp_hashinfo, daddr, hnum, dif); } inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, @@ -1641,9 +1569,10 @@ do_time_wait: switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, skb, th, skb->len)) { case TCP_TW_SYN: { - struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, - ntohs(th->dest), - tcp_v4_iif(skb)); + struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, + skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); if (sk2) { tcp_tw_deschedule((struct tcp_tw_bucket *)sk); tcp_tw_put((struct tcp_tw_bucket *)sk); -- cgit v0.10.2 From 8feaf0c0a5488b3d898a9c207eb6678f44ba3f26 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:09:30 -0700 Subject: [INET]: Generalise tcp_tw_bucket, aka TIME_WAIT sockets This paves the way to generalise the rest of the sock ID lookup routines and saves some bytes in TCPv4 TIME_WAIT sockets on distro kernels (where IPv6 is always built as a module): [root@qemu ~]# grep tw_sock /proc/slabinfo tw_sock_TCPv6 0 0 128 31 1 tw_sock_TCP 0 0 96 41 1 [root@qemu ~]# Now if a protocol wants to use the TIME_WAIT generic infrastructure it only has to set the sk_prot->twsk_obj_size field with the size of its inet_timewait_sock derived sock and proto_register will create sk_prot->twsk_slab, for now its only for INET sockets, but we can introduce timewait_sock later if some non INET transport protocolo wants to use this stuff. Next changesets will take advantage of this new infrastructure to generalise even more TCP code. [acme@toy net-2.6.14]$ grep built-in /tmp/before.size /tmp/after.size /tmp/before.size: 188646 11764 5068 205478 322a6 net/ipv4/built-in.o /tmp/after.size: 188144 11764 5068 204976 320b0 net/ipv4/built-in.o [acme@toy net-2.6.14]$ Tested with both IPv4 & IPv6 (::1 (localhost) & ::ffff:172.20.0.1 (qemu host)). Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 6fcd6a0..98fa323 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -308,6 +308,41 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only) #define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk)) + +#include + +struct tcp6_timewait_sock { + struct tcp_timewait_sock tw_v6_sk; + struct in6_addr tw_v6_daddr; + struct in6_addr tw_v6_rcv_saddr; +}; + +static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk) +{ + return (struct tcp6_timewait_sock *)sk; +} + +static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + &inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr; +} + +static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) +{ + return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; +} + +static inline int tcp_twsk_ipv6only(const struct sock *sk) +{ + return inet_twsk(sk)->tw_ipv6only; +} + +static inline int tcp_v6_ipv6only(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + ipv6_only_sock(sk) : tcp_twsk_ipv6only(sk); +} #else #define __ipv6_only_sock(sk) 0 #define ipv6_only_sock(sk) 0 @@ -322,8 +357,19 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk) return NULL; } -#endif +#define __tcp_v6_rcv_saddr(__sk) NULL +#define tcp_v6_rcv_saddr(__sk) NULL +#define tcp_twsk_ipv6only(__sk) 0 +#define tcp_v6_ipv6only(__sk) 0 +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ -#endif +#define INET6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \ + (((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + ((__sk)->sk_family == AF_INET6) && \ + ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \ + ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#endif +#endif /* __KERNEL__ */ + +#endif /* _IPV6_H */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b88fe05..5d295b1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -179,6 +179,7 @@ struct tcp_info #include #include #include +#include /* This defines a selective acknowledgement block. */ struct tcp_sack_block { @@ -387,6 +388,20 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) return (struct tcp_sock *)sk; } +struct tcp_timewait_sock { + struct inet_timewait_sock tw_sk; + __u32 tw_rcv_nxt; + __u32 tw_snd_nxt; + __u32 tw_rcv_wnd; + __u32 tw_ts_recent; + long tw_ts_recent_stamp; +}; + +static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) +{ + return (struct tcp_timewait_sock *)sk; +} + static inline void *tcp_ca(const struct tcp_sock *tp) { return (void *) tp->ca_priv; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 1c4fa00..c38c637 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -14,6 +14,8 @@ #ifndef _INET_HASHTABLES_H #define _INET_HASHTABLES_H +#include + #include #include #include @@ -310,4 +312,43 @@ sherry_cache: read_unlock(&hashinfo->lhash_lock); return sk; } + +/* Socket demux engine toys. */ +#ifdef __BIG_ENDIAN +#define INET_COMBINED_PORTS(__sport, __dport) \ + (((__u32)(__sport) << 16) | (__u32)(__dport)) +#else /* __LITTLE_ENDIAN */ +#define INET_COMBINED_PORTS(__sport, __dport) \ + (((__u32)(__dport) << 16) | (__u32)(__sport)) +#endif + +#if (BITS_PER_LONG == 64) +#ifdef __BIG_ENDIAN +#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ + const __u64 __name = (((__u64)(__saddr)) << 32) | ((__u64)(__daddr)); +#else /* __LITTLE_ENDIAN */ +#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ + const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr)); +#endif /* __BIG_ENDIAN */ +#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ + (((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ + (((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \ + ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#else /* 32-bit arch */ +#define INET_ADDR_COOKIE(__name, __saddr, __daddr) +#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif) \ + ((inet_sk(__sk)->daddr == (__saddr)) && \ + (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif) \ + ((inet_twsk(__sk)->tw_daddr == (__saddr)) && \ + (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \ + ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#endif /* 64-bit arch */ #endif /* _INET_HASHTABLES_H */ diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h new file mode 100644 index 0000000..ce11704 --- /dev/null +++ b/include/net/inet_timewait_sock.h @@ -0,0 +1,142 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for a generic INET TIMEWAIT sock + * + * From code originally in net/tcp.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET_TIMEWAIT_SOCK_ +#define _INET_TIMEWAIT_SOCK_ + +#include + +#include +#include + +#include +#include + +#include + +#if (BITS_PER_LONG == 64) +#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8 +#else +#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 4 +#endif + +struct inet_bind_bucket; + +/* + * This is a TIME_WAIT sock. It works around the memory consumption + * problems of sockets in such a state on heavily loaded servers, but + * without violating the protocol specification. + */ +struct inet_timewait_sock { + /* + * Now struct sock also uses sock_common, so please just + * don't add nothing before this first member (__tw_common) --acme + */ + struct sock_common __tw_common; +#define tw_family __tw_common.skc_family +#define tw_state __tw_common.skc_state +#define tw_reuse __tw_common.skc_reuse +#define tw_bound_dev_if __tw_common.skc_bound_dev_if +#define tw_node __tw_common.skc_node +#define tw_bind_node __tw_common.skc_bind_node +#define tw_refcnt __tw_common.skc_refcnt +#define tw_prot __tw_common.skc_prot + volatile unsigned char tw_substate; + /* 3 bits hole, try to pack */ + unsigned char tw_rcv_wscale; + /* Socket demultiplex comparisons on incoming packets. */ + /* these five are in inet_sock */ + __u16 tw_sport; + __u32 tw_daddr __attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES))); + __u32 tw_rcv_saddr; + __u16 tw_dport; + __u16 tw_num; + /* And these are ours. */ + __u8 tw_ipv6only:1; + /* 31 bits hole, try to pack */ + int tw_hashent; + int tw_timeout; + unsigned long tw_ttd; + struct inet_bind_bucket *tw_tb; + struct hlist_node tw_death_node; +}; + +static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, + struct hlist_head *list) +{ + hlist_add_head(&tw->tw_node, list); +} + +static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, + struct hlist_head *list) +{ + hlist_add_head(&tw->tw_bind_node, list); +} + +static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw) +{ + return tw->tw_death_node.pprev != NULL; +} + +static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw) +{ + tw->tw_death_node.pprev = NULL; +} + +static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw) +{ + __hlist_del(&tw->tw_death_node); + inet_twsk_dead_node_init(tw); +} + +static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw) +{ + if (inet_twsk_dead_hashed(tw)) { + __inet_twsk_del_dead_node(tw); + return 1; + } + return 0; +} + +#define inet_twsk_for_each(tw, node, head) \ + hlist_for_each_entry(tw, node, head, tw_node) + +#define inet_twsk_for_each_inmate(tw, node, jail) \ + hlist_for_each_entry(tw, node, jail, tw_death_node) + +#define inet_twsk_for_each_inmate_safe(tw, node, safe, jail) \ + hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node) + +static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) +{ + return (struct inet_timewait_sock *)sk; +} + +static inline u32 inet_rcv_saddr(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + inet_sk(sk)->rcv_saddr : inet_twsk(sk)->tw_rcv_saddr; +} + +static inline void inet_twsk_put(struct inet_timewait_sock *tw) +{ + if (atomic_dec_and_test(&tw->tw_refcnt)) { +#ifdef SOCK_REFCNT_DEBUG + printk(KERN_DEBUG "%s timewait_sock %p released\n", + tw->tw_prot->name, tw); +#endif + kmem_cache_free(tw->tw_prot->twsk_slab, tw); + } +} +#endif /* _INET_TIMEWAIT_SOCK_ */ diff --git a/include/net/sock.h b/include/net/sock.h index 391d00b..c902c57 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -88,6 +88,7 @@ do { spin_lock_init(&((__sk)->sk_lock.slock)); \ } while(0) struct sock; +struct proto; /** * struct sock_common - minimal network layer representation of sockets @@ -98,10 +99,11 @@ struct sock; * @skc_node: main hash linkage for various protocol lookup tables * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_refcnt: reference count + * @skc_prot: protocol handlers inside a network family * * This is the minimal network layer representation of sockets, the header - * for struct sock and struct tcp_tw_bucket. - */ + * for struct sock and struct inet_timewait_sock. + */ struct sock_common { unsigned short skc_family; volatile unsigned char skc_state; @@ -110,11 +112,12 @@ struct sock_common { struct hlist_node skc_node; struct hlist_node skc_bind_node; atomic_t skc_refcnt; + struct proto *skc_prot; }; /** * struct sock - network layer representation of sockets - * @__sk_common: shared layout with tcp_tw_bucket + * @__sk_common: shared layout with inet_timewait_sock * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer @@ -140,7 +143,6 @@ struct sock_common { * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used - * @sk_prot: protocol handlers inside a network family * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, IPV6_ADDRFORM for instance) * @sk_err: last error * @sk_err_soft: errors that don't cause failure but are the cause of a persistent failure not just 'timed out' @@ -173,7 +175,7 @@ struct sock_common { */ struct sock { /* - * Now struct tcp_tw_bucket also uses sock_common, so please just + * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; @@ -184,6 +186,7 @@ struct sock { #define sk_node __sk_common.skc_node #define sk_bind_node __sk_common.skc_bind_node #define sk_refcnt __sk_common.skc_refcnt +#define sk_prot __sk_common.skc_prot unsigned char sk_shutdown : 2, sk_no_check : 2, sk_userlocks : 4; @@ -218,7 +221,6 @@ struct sock { struct sk_buff *tail; } sk_backlog; struct sk_buff_head sk_error_queue; - struct proto *sk_prot; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err, @@ -557,6 +559,9 @@ struct proto { kmem_cache_t *slab; unsigned int obj_size; + kmem_cache_t *twsk_slab; + unsigned int twsk_obj_size; + struct request_sock_ops *rsk_prot; struct module *owner; diff --git a/include/net/tcp.h b/include/net/tcp.h index 9d026d8..cf8e664 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -38,207 +38,14 @@ #include #include -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -#include -#endif #include extern struct inet_hashinfo tcp_hashinfo; -#if (BITS_PER_LONG == 64) -#define TCP_ADDRCMP_ALIGN_BYTES 8 -#else -#define TCP_ADDRCMP_ALIGN_BYTES 4 -#endif - -/* This is a TIME_WAIT bucket. It works around the memory consumption - * problems of sockets in such a state on heavily loaded servers, but - * without violating the protocol specification. - */ -struct tcp_tw_bucket { - /* - * Now struct sock also uses sock_common, so please just - * don't add nothing before this first member (__tw_common) --acme - */ - struct sock_common __tw_common; -#define tw_family __tw_common.skc_family -#define tw_state __tw_common.skc_state -#define tw_reuse __tw_common.skc_reuse -#define tw_bound_dev_if __tw_common.skc_bound_dev_if -#define tw_node __tw_common.skc_node -#define tw_bind_node __tw_common.skc_bind_node -#define tw_refcnt __tw_common.skc_refcnt - volatile unsigned char tw_substate; - unsigned char tw_rcv_wscale; - __u16 tw_sport; - /* Socket demultiplex comparisons on incoming packets. */ - /* these five are in inet_sock */ - __u32 tw_daddr - __attribute__((aligned(TCP_ADDRCMP_ALIGN_BYTES))); - __u32 tw_rcv_saddr; - __u16 tw_dport; - __u16 tw_num; - /* And these are ours. */ - int tw_hashent; - int tw_timeout; - __u32 tw_rcv_nxt; - __u32 tw_snd_nxt; - __u32 tw_rcv_wnd; - __u32 tw_ts_recent; - long tw_ts_recent_stamp; - unsigned long tw_ttd; - struct inet_bind_bucket *tw_tb; - struct hlist_node tw_death_node; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - struct in6_addr tw_v6_daddr; - struct in6_addr tw_v6_rcv_saddr; - int tw_v6_ipv6only; -#endif -}; - -static __inline__ void tw_add_node(struct tcp_tw_bucket *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_node, list); -} - -static __inline__ void tw_add_bind_node(struct tcp_tw_bucket *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_bind_node, list); -} - -static inline int tw_dead_hashed(struct tcp_tw_bucket *tw) -{ - return tw->tw_death_node.pprev != NULL; -} - -static __inline__ void tw_dead_node_init(struct tcp_tw_bucket *tw) -{ - tw->tw_death_node.pprev = NULL; -} - -static __inline__ void __tw_del_dead_node(struct tcp_tw_bucket *tw) -{ - __hlist_del(&tw->tw_death_node); - tw_dead_node_init(tw); -} - -static __inline__ int tw_del_dead_node(struct tcp_tw_bucket *tw) -{ - if (tw_dead_hashed(tw)) { - __tw_del_dead_node(tw); - return 1; - } - return 0; -} - -#define tw_for_each(tw, node, head) \ - hlist_for_each_entry(tw, node, head, tw_node) - -#define tw_for_each_inmate(tw, node, jail) \ - hlist_for_each_entry(tw, node, jail, tw_death_node) - -#define tw_for_each_inmate_safe(tw, node, safe, jail) \ - hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node) - -#define tcptw_sk(__sk) ((struct tcp_tw_bucket *)(__sk)) - -static inline u32 tcp_v4_rcv_saddr(const struct sock *sk) -{ - return likely(sk->sk_state != TCP_TIME_WAIT) ? - inet_sk(sk)->rcv_saddr : tcptw_sk(sk)->tw_rcv_saddr; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) -{ - return likely(sk->sk_state != TCP_TIME_WAIT) ? - &inet6_sk(sk)->rcv_saddr : &tcptw_sk(sk)->tw_v6_rcv_saddr; -} - -static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) -{ - return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; -} - -#define tcptw_sk_ipv6only(__sk) (tcptw_sk(__sk)->tw_v6_ipv6only) - -static inline int tcp_v6_ipv6only(const struct sock *sk) -{ - return likely(sk->sk_state != TCP_TIME_WAIT) ? - ipv6_only_sock(sk) : tcptw_sk_ipv6only(sk); -} -#else -# define __tcp_v6_rcv_saddr(__sk) NULL -# define tcp_v6_rcv_saddr(__sk) NULL -# define tcptw_sk_ipv6only(__sk) 0 -# define tcp_v6_ipv6only(__sk) 0 -#endif - -extern kmem_cache_t *tcp_timewait_cachep; - -static inline void tcp_tw_put(struct tcp_tw_bucket *tw) -{ - if (atomic_dec_and_test(&tw->tw_refcnt)) { -#ifdef SOCK_REFCNT_DEBUG - printk(KERN_DEBUG "tw_bucket %p released\n", tw); -#endif - kmem_cache_free(tcp_timewait_cachep, tw); - } -} - extern atomic_t tcp_orphan_count; extern int tcp_tw_count; extern void tcp_time_wait(struct sock *sk, int state, int timeo); -extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); - - -/* Socket demux engine toys. */ -#ifdef __BIG_ENDIAN -#define TCP_COMBINED_PORTS(__sport, __dport) \ - (((__u32)(__sport)<<16) | (__u32)(__dport)) -#else /* __LITTLE_ENDIAN */ -#define TCP_COMBINED_PORTS(__sport, __dport) \ - (((__u32)(__dport)<<16) | (__u32)(__sport)) -#endif - -#if (BITS_PER_LONG == 64) -#ifdef __BIG_ENDIAN -#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \ - __u64 __name = (((__u64)(__saddr))<<32)|((__u64)(__daddr)); -#else /* __LITTLE_ENDIAN */ -#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \ - __u64 __name = (((__u64)(__daddr))<<32)|((__u64)(__saddr)); -#endif /* __BIG_ENDIAN */ -#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - (((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \ - ((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - (((*((__u64 *)&(tcptw_sk(__sk)->tw_daddr))) == (__cookie)) && \ - ((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#else /* 32-bit arch */ -#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) -#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - ((inet_sk(__sk)->daddr == (__saddr)) && \ - (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ - ((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - ((tcptw_sk(__sk)->tw_daddr == (__saddr)) && \ - (tcptw_sk(__sk)->tw_rcv_saddr == (__daddr)) && \ - ((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#endif /* 64-bit arch */ - -#define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \ - (((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \ - ((__sk)->sk_family == AF_INET6) && \ - ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \ - ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +extern void tcp_tw_deschedule(struct inet_timewait_sock *tw); #define MAX_TCP_HEADER (128 + MAX_HEADER) @@ -543,7 +350,7 @@ extern int tcp_v4_rcv(struct sk_buff *skb); extern int tcp_v4_remember_stamp(struct sock *sk); -extern int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw); +extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); @@ -616,10 +423,9 @@ enum tcp_tw_status }; -extern enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, +extern enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - struct tcphdr *th, - unsigned len); + const struct tcphdr *th); extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock *req, diff --git a/net/core/sock.c b/net/core/sock.c index a1a23be..aba31fe 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1378,7 +1378,8 @@ static LIST_HEAD(proto_list); int proto_register(struct proto *prot, int alloc_slab) { - char *request_sock_slab_name; + char *request_sock_slab_name = NULL; + char *timewait_sock_slab_name; int rc = -ENOBUFS; if (alloc_slab) { @@ -1409,6 +1410,23 @@ int proto_register(struct proto *prot, int alloc_slab) goto out_free_request_sock_slab_name; } } + + if (prot->twsk_obj_size) { + static const char mask[] = "tw_sock_%s"; + + timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); + + if (timewait_sock_slab_name == NULL) + goto out_free_request_sock_slab; + + sprintf(timewait_sock_slab_name, mask, prot->name); + prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, + prot->twsk_obj_size, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (prot->twsk_slab == NULL) + goto out_free_timewait_sock_slab_name; + } } write_lock(&proto_list_lock); @@ -1417,6 +1435,13 @@ int proto_register(struct proto *prot, int alloc_slab) rc = 0; out: return rc; +out_free_timewait_sock_slab_name: + kfree(timewait_sock_slab_name); +out_free_request_sock_slab: + if (prot->rsk_prot && prot->rsk_prot->slab) { + kmem_cache_destroy(prot->rsk_prot->slab); + prot->rsk_prot->slab = NULL; + } out_free_request_sock_slab_name: kfree(request_sock_slab_name); out_free_sock_slab: @@ -1444,6 +1469,14 @@ void proto_unregister(struct proto *prot) prot->rsk_prot->slab = NULL; } + if (prot->twsk_slab != NULL) { + const char *name = kmem_cache_name(prot->twsk_slab); + + kmem_cache_destroy(prot->twsk_slab); + kfree(name); + prot->twsk_slab = NULL; + } + list_del(&prot->node); write_unlock(&proto_list_lock); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2f4b1a3..f1a708b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,8 +271,6 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); -kmem_cache_t *tcp_timewait_cachep; - atomic_t tcp_orphan_count = ATOMIC_INIT(0); int sysctl_tcp_mem[3]; @@ -2264,13 +2262,6 @@ void __init tcp_init(void) if (!tcp_hashinfo.bind_bucket_cachep) panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); - tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", - sizeof(struct tcp_tw_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!tcp_timewait_cachep) - panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); - /* Size and allocate the main established and bind bucket * hash tables. * @@ -2363,4 +2354,3 @@ EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); EXPORT_SYMBOL(tcp_statistics); -EXPORT_SYMBOL(tcp_timewait_cachep); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 1a89a03..6f2d6f2 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -81,7 +81,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); if (r->tcpdiag_state == TCP_TIME_WAIT) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk; + const struct inet_timewait_sock *tw = inet_twsk(sk); long tmo = tw->tw_ttd - jiffies; if (tmo < 0) tmo = 0; @@ -99,10 +99,12 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->tcpdiag_inode = 0; #ifdef CONFIG_IP_TCPDIAG_IPV6 if (r->tcpdiag_family == AF_INET6) { + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, - &tw->tw_v6_rcv_saddr); + &tcp6tw->tw_v6_rcv_saddr); ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, - &tw->tw_v6_daddr); + &tcp6tw->tw_v6_daddr); } #endif nlh->nlmsg_len = skb->tail - b; @@ -239,7 +241,7 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) out: if (sk) { if (sk->sk_state == TCP_TIME_WAIT) - tcp_tw_put((struct tcp_tw_bucket*)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); else sock_put(sk); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a678709..ce423e4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -106,7 +106,7 @@ int sysctl_local_port_range[2] = { 1024, 4999 }; static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) { - const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); + const u32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; struct hlist_node *node; int reuse = sk->sk_reuse; @@ -119,7 +119,7 @@ static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { if (!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) { - const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); if (!sk2_rcv_saddr || !sk_rcv_saddr || sk2_rcv_saddr == sk_rcv_saddr) break; @@ -251,10 +251,10 @@ static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, const int dif) { struct inet_ehash_bucket *head; - TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) - __u32 ports = TCP_COMBINED_PORTS(sport, hnum); + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(sport, hnum); struct sock *sk; - struct hlist_node *node; + const struct hlist_node *node; /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ @@ -262,13 +262,13 @@ static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, head = &tcp_hashinfo.ehash[hash]; read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { - if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) + if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. */ sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { - if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) + if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; } sk = NULL; @@ -313,27 +313,28 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) /* called with local bh disabled */ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, - struct tcp_tw_bucket **twp) + struct inet_timewait_sock **twp) { struct inet_sock *inet = inet_sk(sk); u32 daddr = inet->rcv_saddr; u32 saddr = inet->daddr; int dif = sk->sk_bound_dev_if; - TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) - __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size); struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; - struct hlist_node *node; - struct tcp_tw_bucket *tw; + const struct hlist_node *node; + struct inet_timewait_sock *tw; write_lock(&head->lock); /* Check TIME-WAIT sockets first. */ sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - tw = (struct tcp_tw_bucket *)sk2; + tw = inet_twsk(sk2); - if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); struct tcp_sock *tp = tcp_sk(sk); /* With PAWS, it is safe from the viewpoint @@ -350,15 +351,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ - if (tw->tw_ts_recent_stamp && + if (tcptw->tw_ts_recent_stamp && (!twp || (sysctl_tcp_tw_reuse && xtime.tv_sec - - tw->tw_ts_recent_stamp > 1))) { - if ((tp->write_seq = - tw->tw_snd_nxt + 65535 + 2) == 0) + tcptw->tw_ts_recent_stamp > 1))) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) tp->write_seq = 1; - tp->rx_opt.ts_recent = tw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; sock_hold(sk2); goto unique; } else @@ -369,7 +370,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, /* And established part... */ sk_for_each(sk2, node, &head->chain) { - if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif)) goto not_unique; } @@ -392,7 +393,7 @@ unique: tcp_tw_deschedule(tw); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - tcp_tw_put(tw); + inet_twsk_put(tw); } return 0; @@ -429,7 +430,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) static u32 hint; u32 offset = hint + connect_port_offset(sk); struct hlist_node *node; - struct tcp_tw_bucket *tw = NULL; + struct inet_timewait_sock *tw = NULL; local_bh_disable(); for (i = 1; i <= range; i++) { @@ -482,7 +483,7 @@ ok: if (tw) { tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); } ret = 0; @@ -757,7 +758,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) return; } if (sk->sk_state == TCP_TIME_WAIT) { - tcp_tw_put((struct tcp_tw_bucket *)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); return; } @@ -1002,12 +1003,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + struct inet_timewait_sock *tw = inet_twsk(sk); + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, - tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent); - tcp_tw_put(tw); + inet_twsk_put(tw); } static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) @@ -1368,7 +1370,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) bh_lock_sock(nsk); return nsk; } - tcp_tw_put((struct tcp_tw_bucket *)nsk); + inet_twsk_put((struct inet_timewait_sock *)nsk); return NULL; } @@ -1557,25 +1559,25 @@ discard_and_relse: do_time_wait: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *) sk); goto discard_it; } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TCP_MIB_INERRS); - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *) sk); goto discard_it; } - switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, - skb, th, skb->len)) { + switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, + skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); if (sk2) { - tcp_tw_deschedule((struct tcp_tw_bucket *)sk); - tcp_tw_put((struct tcp_tw_bucket *)sk); + tcp_tw_deschedule((struct inet_timewait_sock *)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); sk = sk2; goto process; } @@ -1639,18 +1641,18 @@ int tcp_v4_remember_stamp(struct sock *sk) return 0; } -int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) +int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) { - struct inet_peer *peer = NULL; - - peer = inet_getpeer(tw->tw_daddr, 1); + struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); if (peer) { - if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 || + const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + + if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && - peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = tw->tw_ts_recent_stamp; - peer->tcp_ts = tw->tw_ts_recent; + peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; + peer->tcp_ts = tcptw->tw_ts_recent; } inet_putpeer(peer); return 1; @@ -1758,13 +1760,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ -static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head) +static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) { return hlist_empty(head) ? NULL : - list_entry(head->first, struct tcp_tw_bucket, tw_node); + list_entry(head->first, struct inet_timewait_sock, tw_node); } -static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) +static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) { return tw->tw_node.next ? hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; @@ -1860,7 +1862,7 @@ static void *established_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { struct sock *sk; struct hlist_node *node; - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; /* We can reschedule _before_ having picked the target: */ cond_resched_softirq(); @@ -1874,8 +1876,8 @@ static void *established_get_first(struct seq_file *seq) goto out; } st->state = TCP_SEQ_STATE_TIME_WAIT; - tw_for_each(tw, node, - &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { + inet_twsk_for_each(tw, node, + &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { if (tw->tw_family != st->family) { continue; } @@ -1892,7 +1894,7 @@ out: static void *established_get_next(struct seq_file *seq, void *cur) { struct sock *sk = cur; - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; struct hlist_node *node; struct tcp_iter_state* st = seq->private; @@ -2159,7 +2161,7 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); } -static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) +static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i) { unsigned int dest, src; __u16 destp, srcp; @@ -2261,6 +2263,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), + .twsk_obj_size = sizeof(struct tcp_timewait_sock), .rsk_prot = &tcp_request_sock_ops, }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f29e2f6..5b5a493 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -41,7 +41,7 @@ int sysctl_tcp_max_tw_buckets = NR_FILE*2; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_abort_on_overflow; -static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); +static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo); static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -58,7 +58,7 @@ int tcp_tw_count; /* Must be called with locally disabled BHs. */ -static void tcp_timewait_kill(struct tcp_tw_bucket *tw) +static void tcp_timewait_kill(struct inet_timewait_sock *tw) { struct inet_bind_hashbucket *bhead; struct inet_bind_bucket *tb; @@ -85,11 +85,11 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) #ifdef SOCK_REFCNT_DEBUG if (atomic_read(&tw->tw_refcnt) != 1) { - printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, - atomic_read(&tw->tw_refcnt)); + printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", + tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); } #endif - tcp_tw_put(tw); + inet_twsk_put(tw); } /* @@ -121,19 +121,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) * to avoid misread sequence numbers, states etc. --ANK */ enum tcp_tw_status -tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, - struct tcphdr *th, unsigned len) +tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, + const struct tcphdr *th) { + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); struct tcp_options_received tmp_opt; int paws_reject = 0; tmp_opt.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) { + if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { tcp_parse_options(skb, &tmp_opt, 0); if (tmp_opt.saw_tstamp) { - tmp_opt.ts_recent = tw->tw_ts_recent; - tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tmp_opt.ts_recent = tcptw->tw_ts_recent; + tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_check(&tmp_opt, th->rst); } } @@ -144,20 +145,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tw->tw_rcv_nxt, - tw->tw_rcv_nxt + tw->tw_rcv_wnd)) + tcptw->tw_rcv_nxt, + tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) return TCP_TW_ACK; if (th->rst) goto kill; - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt)) + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) goto kill_with_rst; /* Dup ACK? */ - if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) || + if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -165,19 +166,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * reset. */ if (!th->fin || - TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) { + TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_RST; } /* FIN arrived, enter true time-wait state. */ - tw->tw_substate = TCP_TIME_WAIT; - tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tw->tw_substate = TCP_TIME_WAIT; + tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tw->tw_ts_recent_stamp = xtime.tv_sec; - tw->tw_ts_recent = tmp_opt.rcv_tsval; + tcptw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } /* I am shamed, but failed to make it more elegant. @@ -186,7 +187,7 @@ kill_with_rst: * do not undertsnad recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && - sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp && + sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_v4_tw_remember_stamp(tw)) tcp_tw_schedule(tw, tw->tw_timeout); else @@ -212,7 +213,7 @@ kill_with_rst: */ if (!paws_reject && - (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt && + (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ @@ -224,18 +225,18 @@ kill_with_rst: if (sysctl_tcp_rfc1337 == 0) { kill: tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } } tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { - tw->tw_ts_recent = tmp_opt.rcv_tsval; - tw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; + tcptw->tw_ts_recent_stamp = xtime.tv_sec; } - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -257,9 +258,10 @@ kill: */ if (th->syn && !th->rst && !th->ack && !paws_reject && - (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) || - (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { - u32 isn = tw->tw_snd_nxt + 65535 + 2; + (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || + (tmp_opt.saw_tstamp && + (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { + u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; TCP_SKB_CB(skb)->when = isn; @@ -284,7 +286,7 @@ kill: */ return TCP_TW_ACK; } - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -293,7 +295,7 @@ kill: * relevant info into it from the SK, and mess with hash chains * and list linkage. */ -static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) +static void __tcp_tw_hashdance(struct sock *sk, struct inet_timewait_sock *tw) { const struct inet_sock *inet = inet_sk(sk); struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[sk->sk_hashent]; @@ -306,7 +308,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) spin_lock(&bhead->lock); tw->tw_tb = inet->bind_hash; BUG_TRAP(inet->bind_hash); - tw_add_bind_node(tw, &tw->tw_tb->owners); + inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); write_lock(&ehead->lock); @@ -316,7 +318,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) sock_prot_dec_use(sk->sk_prot); /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ - tw_add_node(tw, &(ehead + tcp_hashinfo.ehash_size)->chain); + inet_twsk_add_node(tw, &(ehead + tcp_hashinfo.ehash_size)->chain); atomic_inc(&tw->tw_refcnt); write_unlock(&ehead->lock); @@ -327,19 +329,23 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) */ void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct tcp_tw_bucket *tw = NULL; - struct tcp_sock *tp = tcp_sk(sk); + struct inet_timewait_sock *tw = NULL; + const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tp->af_specific->remember_stamp(sk); if (tcp_tw_count < sysctl_tcp_max_tw_buckets) - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); + tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, SLAB_ATOMIC); + + if (tw != NULL) { + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + const struct inet_sock *inet = inet_sk(sk); + const int rto = (tp->rto << 2) - (tp->rto >> 1); - if(tw != NULL) { - struct inet_sock *inet = inet_sk(sk); - int rto = (tp->rto<<2) - (tp->rto>>1); + /* Remember our protocol */ + tw->tw_prot = sk->sk_prot_creator; /* Give us an identity. */ tw->tw_daddr = inet->daddr; @@ -356,25 +362,23 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) atomic_set(&tw->tw_refcnt, 1); tw->tw_hashent = sk->sk_hashent; - tw->tw_rcv_nxt = tp->rcv_nxt; - tw->tw_snd_nxt = tp->snd_nxt; - tw->tw_rcv_wnd = tcp_receive_window(tp); - tw->tw_ts_recent = tp->rx_opt.ts_recent; - tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; - tw_dead_node_init(tw); + tcptw->tw_rcv_nxt = tp->rcv_nxt; + tcptw->tw_snd_nxt = tp->snd_nxt; + tcptw->tw_rcv_wnd = tcp_receive_window(tp); + tcptw->tw_ts_recent = tp->rx_opt.ts_recent; + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + inet_twsk_dead_node_init(tw); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); - ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); - tw->tw_v6_ipv6only = np->ipv6only; - } else { - memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr)); - memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr)); - tw->tw_v6_ipv6only = 0; - } + ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6only = np->ipv6only; + } else + tw->tw_ipv6only = 0; #endif /* Linkage updates. */ __tcp_tw_hashdance(sk, tw); @@ -392,7 +396,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } tcp_tw_schedule(tw, timeo); - tcp_tw_put(tw); + inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than @@ -427,7 +431,7 @@ static u32 twkill_thread_slots; /* Returns non-zero if quota exceeded. */ static int tcp_do_twkill_work(int slot, unsigned int quota) { - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; struct hlist_node *node; unsigned int killed; int ret; @@ -441,11 +445,11 @@ static int tcp_do_twkill_work(int slot, unsigned int quota) killed = 0; ret = 0; rescan: - tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { - __tw_del_dead_node(tw); + inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { + __inet_twsk_del_dead_node(tw); spin_unlock(&tw_death_lock); tcp_timewait_kill(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); killed++; spin_lock(&tw_death_lock); if (killed > quota) { @@ -531,11 +535,11 @@ static void twkill_work(void *dummy) */ /* This is for handling early-kills of TIME_WAIT sockets. */ -void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +void tcp_tw_deschedule(struct inet_timewait_sock *tw) { spin_lock(&tw_death_lock); - if (tw_del_dead_node(tw)) { - tcp_tw_put(tw); + if (inet_twsk_del_dead_node(tw)) { + inet_twsk_put(tw); if (--tcp_tw_count == 0) del_timer(&tcp_tw_timer); } @@ -552,7 +556,7 @@ static struct timer_list tcp_twcal_timer = TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; -static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) +static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo) { struct hlist_head *list; int slot; @@ -586,7 +590,7 @@ static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) spin_lock(&tw_death_lock); /* Unlink it, if it was scheduled */ - if (tw_del_dead_node(tw)) + if (inet_twsk_del_dead_node(tw)) tcp_tw_count--; else atomic_inc(&tw->tw_refcnt); @@ -644,13 +648,13 @@ void tcp_twcal_tick(unsigned long dummy) for (n=0; nrcv_saddr; const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; - u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); int sk_ipv6only = ipv6_only_sock(sk); int sk2_ipv6only = tcp_v6_ipv6only(sk2); int addr_type = ipv6_addr_type(sk_rcv_saddr6); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 93a66b9..af8ad5b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -308,33 +308,32 @@ static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u struct in6_addr *daddr, u16 hnum, int dif) { - struct inet_ehash_bucket *head; struct sock *sk; - struct hlist_node *node; - __u32 ports = TCP_COMBINED_PORTS(sport, hnum); - int hash; - + const struct hlist_node *node; + const __u32 ports = INET_COMBINED_PORTS(sport, hnum); /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); - head = &tcp_hashinfo.ehash[hash]; + const int hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; + read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { /* For IPV6 do the cheaper port and family tests first. */ - if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif)) + if (INET6_MATCH(sk, saddr, daddr, ports, dif)) goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. */ sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { - /* FIXME: acme: check this... */ - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + const struct inet_timewait_sock *tw = inet_twsk(sk); if(*((__u32 *)&(tw->tw_dport)) == ports && sk->sk_family == PF_INET6) { - if(ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && - (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + + if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && + (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) goto hit; } } @@ -455,43 +454,46 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) } static int __tcp_v6_check_established(struct sock *sk, __u16 lport, - struct tcp_tw_bucket **twp) + struct inet_timewait_sock **twp) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *daddr = &np->rcv_saddr; struct in6_addr *saddr = &np->daddr; int dif = sk->sk_bound_dev_if; - u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); - int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); + const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; - struct hlist_node *node; - struct tcp_tw_bucket *tw; + const struct hlist_node *node; + struct inet_timewait_sock *tw; write_lock(&head->lock); /* Check TIME-WAIT sockets first. */ sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - tw = (struct tcp_tw_bucket*)sk2; + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2); + + tw = inet_twsk(sk2); if(*((__u32 *)&(tw->tw_dport)) == ports && sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && + ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); struct tcp_sock *tp = tcp_sk(sk); - if (tw->tw_ts_recent_stamp && - (!twp || (sysctl_tcp_tw_reuse && - xtime.tv_sec - - tw->tw_ts_recent_stamp > 1))) { + if (tcptw->tw_ts_recent_stamp && + (!twp || + (sysctl_tcp_tw_reuse && + xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { /* See comment in tcp_ipv4.c */ - tp->write_seq = tw->tw_snd_nxt + 65535 + 2; + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (!tp->write_seq) tp->write_seq = 1; - tp->rx_opt.ts_recent = tw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; sock_hold(sk2); goto unique; } else @@ -502,7 +504,7 @@ static int __tcp_v6_check_established(struct sock *sk, __u16 lport, /* And established part... */ sk_for_each(sk2, node, &head->chain) { - if(TCP_IPV6_MATCH(sk2, saddr, daddr, ports, dif)) + if (INET6_MATCH(sk2, saddr, daddr, ports, dif)) goto not_unique; } @@ -521,7 +523,7 @@ unique: tcp_tw_deschedule(tw); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - tcp_tw_put(tw); + inet_twsk_put(tw); } return 0; @@ -556,7 +558,7 @@ static int tcp_v6_hash_connect(struct sock *sk) static u32 hint; u32 offset = hint + tcpv6_port_offset(sk); struct hlist_node *node; - struct tcp_tw_bucket *tw = NULL; + struct inet_timewait_sock *tw = NULL; local_bh_disable(); for (i = 1; i <= range; i++) { @@ -609,7 +611,7 @@ ok: if (tw) { tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); } ret = 0; @@ -845,7 +847,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (sk->sk_state == TCP_TIME_WAIT) { - tcp_tw_put((struct tcp_tw_bucket*)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); return; } @@ -1223,12 +1225,14 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + struct inet_timewait_sock *tw = inet_twsk(sk); + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v6_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, - tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcptw->tw_ts_recent); - tcp_tw_put(tw); + inet_twsk_put(tw); } static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) @@ -1261,7 +1265,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) bh_lock_sock(nsk); return nsk; } - tcp_tw_put((struct tcp_tw_bucket*)nsk); + inet_twsk_put((struct inet_timewait_sock *)nsk); return NULL; } @@ -1798,26 +1802,26 @@ discard_and_relse: do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *)sk); goto discard_it; } if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TCP_MIB_INERRS); - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *)sk); goto discard_it; } - switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, - skb, th, skb->len)) { + switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, + skb, th)) { case TCP_TW_SYN: { struct sock *sk2; sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); if (sk2 != NULL) { - tcp_tw_deschedule((struct tcp_tw_bucket *)sk); - tcp_tw_put((struct tcp_tw_bucket *)sk); + tcp_tw_deschedule((struct inet_timewait_sock *)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); sk = sk2; goto process; } @@ -2137,17 +2141,18 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) } static void get_timewait6_sock(struct seq_file *seq, - struct tcp_tw_bucket *tw, int i) + struct inet_timewait_sock *tw, int i) { struct in6_addr *dest, *src; __u16 destp, srcp; + struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); int ttd = tw->tw_ttd - jiffies; if (ttd < 0) ttd = 0; - dest = &tw->tw_v6_daddr; - src = &tw->tw_v6_rcv_saddr; + dest = &tcp6tw->tw_v6_daddr; + src = &tcp6tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); @@ -2244,6 +2249,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .rsk_prot = &tcp6_request_sock_ops, }; -- cgit v0.10.2 From e48c414ee61f4ac8d5cff2973e66a7cbc8a93aa5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:09:46 -0700 Subject: [INET]: Generalise the TCP sock ID lookup routines And also some TIME_WAIT functions. [acme@toy net-2.6.14]$ grep built-in /tmp/before.size /tmp/after.size /tmp/before.size: 282955 13122 9312 305389 4a8ed net/ipv4/built-in.o /tmp/after.size: 281566 13122 9312 304000 4a380 net/ipv4/built-in.o [acme@toy net-2.6.14]$ I kept them still inlined, will uninline at some point to see what would be the performance difference. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index c38c637..b5c0d64 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -30,6 +30,7 @@ #include #include +#include /* This is for all connections with a full identity, no wildcards. * New scheme, half the table is for TIME_WAIT, the other half is @@ -285,13 +286,13 @@ extern struct sock *__inet_lookup_listener(const struct hlist_head *head, const int dif); /* Optimize the common listener case. */ -static inline struct sock *inet_lookup_listener(struct inet_hashinfo *hashinfo, - const u32 daddr, - const unsigned short hnum, - const int dif) +static inline struct sock * + inet_lookup_listener(struct inet_hashinfo *hashinfo, + const u32 daddr, + const unsigned short hnum, const int dif) { struct sock *sk = NULL; - struct hlist_head *head; + const struct hlist_head *head; read_lock(&hashinfo->lhash_lock); head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; @@ -351,4 +352,70 @@ sherry_cache: ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) #endif /* 64-bit arch */ + +/* + * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need + * not check it for lookups anymore, thanks Alexey. -DaveM + * + * Local BH must be disabled here. + */ +static inline struct sock * + __inet_lookup_established(struct inet_hashinfo *hashinfo, + const u32 saddr, const u16 sport, + const u32 daddr, const u16 hnum, + const int dif) +{ + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(sport, hnum); + struct sock *sk; + const struct hlist_node *node; + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + const int hash = inet_ehashfn(daddr, hnum, saddr, sport, hashinfo->ehash_size); + struct inet_ehash_bucket *head = &hashinfo->ehash[hash]; + + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { + if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { + if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; + } + sk = NULL; +out: + read_unlock(&head->lock); + return sk; +hit: + sock_hold(sk); + goto out; +} + +static inline struct sock *__inet_lookup(struct inet_hashinfo *hashinfo, + const u32 saddr, const u16 sport, + const u32 daddr, const u16 hnum, + const int dif) +{ + struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport, daddr, + hnum, dif); + return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif); +} + +static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo, + const u32 saddr, const u16 sport, + const u32 daddr, const u16 dport, + const int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} #endif /* _INET_HASHTABLES_H */ diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index ce11704..020f280 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -17,6 +17,7 @@ #include +#include #include #include @@ -32,6 +33,7 @@ #endif struct inet_bind_bucket; +struct inet_hashinfo; /* * This is a TIME_WAIT sock. It works around the memory consumption @@ -139,4 +141,11 @@ static inline void inet_twsk_put(struct inet_timewait_sock *tw) kmem_cache_free(tw->tw_prot->twsk_slab, tw); } } + +extern void __inet_twsk_kill(struct inet_timewait_sock *tw, + struct inet_hashinfo *hashinfo); + +extern void __inet_twsk_hashdance(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo); #endif /* _INET_TIMEWAIT_SOCK_ */ diff --git a/include/net/sock.h b/include/net/sock.h index c902c57..bdae0a5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -255,28 +255,28 @@ struct sock { /* * Hashed lists helper routines */ -static inline struct sock *__sk_head(struct hlist_head *head) +static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } -static inline struct sock *sk_head(struct hlist_head *head) +static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? NULL : __sk_head(head); } -static inline struct sock *sk_next(struct sock *sk) +static inline struct sock *sk_next(const struct sock *sk) { return sk->sk_node.next ? hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; } -static inline int sk_unhashed(struct sock *sk) +static inline int sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } -static inline int sk_hashed(struct sock *sk) +static inline int sk_hashed(const struct sock *sk) { return sk->sk_node.pprev != NULL; } @@ -494,7 +494,7 @@ extern int sk_wait_data(struct sock *sk, long *timeo); struct request_sock_ops; /* Here is the right place to enable sock refcounting debugging */ -#define SOCK_REFCNT_DEBUG +//#define SOCK_REFCNT_DEBUG /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 2d8d30e..6650d18 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -5,6 +5,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ + inet_timewait_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 88fcba0..d94e962 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -162,3 +162,5 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad } return result; } + +EXPORT_SYMBOL_GPL(__inet_lookup_listener); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c new file mode 100644 index 0000000..d38d160 --- /dev/null +++ b/net/ipv4/inet_timewait_sock.c @@ -0,0 +1,83 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic TIME_WAIT sockets functions + * + * From code orinally in TCP + */ + +#include + +#include +#include + +/* Must be called with locally disabled BHs. */ +void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) +{ + struct inet_bind_hashbucket *bhead; + struct inet_bind_bucket *tb; + /* Unlink from established hashes. */ + struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent]; + + write_lock(&ehead->lock); + if (hlist_unhashed(&tw->tw_node)) { + write_unlock(&ehead->lock); + return; + } + __hlist_del(&tw->tw_node); + sk_node_init(&tw->tw_node); + write_unlock(&ehead->lock); + + /* Disassociate with bind bucket. */ + bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; + spin_lock(&bhead->lock); + tb = tw->tw_tb; + __hlist_del(&tw->tw_bind_node); + tw->tw_tb = NULL; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + spin_unlock(&bhead->lock); +#ifdef SOCK_REFCNT_DEBUG + if (atomic_read(&tw->tw_refcnt) != 1) { + printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", + tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); + } +#endif + inet_twsk_put(tw); +} + +/* + * Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the relevant info into it + * from the SK, and mess with hash chains and list linkage. + */ +void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, + struct inet_hashinfo *hashinfo) +{ + const struct inet_sock *inet = inet_sk(sk); + struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent]; + struct inet_bind_hashbucket *bhead; + /* Step 1: Put TW into bind hash. Original socket stays there too. + Note, that any socket with inet->num != 0 MUST be bound in + binding cache, even if it is closed. + */ + bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; + spin_lock(&bhead->lock); + tw->tw_tb = inet->bind_hash; + BUG_TRAP(inet->bind_hash); + inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); + spin_unlock(&bhead->lock); + + write_lock(&ehead->lock); + + /* Step 2: Remove SK from established hash. */ + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + + /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ + inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain); + atomic_inc(&tw->tw_refcnt); + + write_unlock(&ehead->lock); +} diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 6f2d6f2..60c6a79 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -174,8 +174,6 @@ nlmsg_failure: return -1; } -extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, - int dif); #ifdef CONFIG_IP_TCPDIAG_IPV6 extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, struct in6_addr *daddr, u16 dport, @@ -197,9 +195,9 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) struct sk_buff *rep; if (req->tcpdiag_family == AF_INET) { - sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, - req->id.tcpdiag_src[0], req->id.tcpdiag_sport, - req->id.tcpdiag_if); + sk = inet_lookup(&tcp_hashinfo, req->id.tcpdiag_dst[0], + req->id.tcpdiag_dport, req->id.tcpdiag_src[0], + req->id.tcpdiag_sport, req->id.tcpdiag_if); } #ifdef CONFIG_IP_TCPDIAG_IPV6 else if (req->tcpdiag_family == AF_INET6) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ce423e4..e7e91e6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -238,71 +238,6 @@ void tcp_unhash(struct sock *sk) inet_unhash(&tcp_hashinfo, sk); } -/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so - * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM - * - * Local BH must be disabled here. - */ - -static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, - const u16 sport, - const u32 daddr, - const u16 hnum, - const int dif) -{ - struct inet_ehash_bucket *head; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(sport, hnum); - struct sock *sk; - const struct hlist_node *node; - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size); - head = &tcp_hashinfo.ehash[hash]; - read_lock(&head->lock); - sk_for_each(sk, node, &head->chain) { - if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) - goto hit; /* You sunk my battleship! */ - } - - /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { - if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) - goto hit; - } - sk = NULL; -out: - read_unlock(&head->lock); - return sk; -hit: - sock_hold(sk); - goto out; -} - -static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, - u32 daddr, u16 hnum, int dif) -{ - struct sock *sk = __tcp_v4_lookup_established(saddr, sport, - daddr, hnum, dif); - - return sk ? : inet_lookup_listener(&tcp_hashinfo, daddr, hnum, dif); -} - -inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, - u16 dport, int dif) -{ - struct sock *sk; - - local_bh_disable(); - sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); - local_bh_enable(); - - return sk; -} - -EXPORT_SYMBOL_GPL(tcp_v4_lookup); - static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) { return secure_tcp_sequence_number(skb->nh.iph->daddr, @@ -751,8 +686,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) return; } - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, - th->source, tcp_v4_iif(skb)); + sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr, + th->source, tcp_v4_iif(skb)); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; @@ -1359,11 +1294,9 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) if (req) return tcp_check_req(sk, skb, req, prev); - nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, - th->source, - skb->nh.iph->daddr, - ntohs(th->dest), - tcp_v4_iif(skb)); + nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, + th->source, skb->nh.iph->daddr, + ntohs(th->dest), tcp_v4_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { @@ -1505,9 +1438,9 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, ntohs(th->dest), - tcp_v4_iif(skb)); + sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, ntohs(th->dest), + tcp_v4_iif(skb)); if (!sk) goto no_tcp_socket; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5b5a493..4112f7a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -56,42 +56,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) int tcp_tw_count; - -/* Must be called with locally disabled BHs. */ -static void tcp_timewait_kill(struct inet_timewait_sock *tw) -{ - struct inet_bind_hashbucket *bhead; - struct inet_bind_bucket *tb; - /* Unlink from established hashes. */ - struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[tw->tw_hashent]; - - write_lock(&ehead->lock); - if (hlist_unhashed(&tw->tw_node)) { - write_unlock(&ehead->lock); - return; - } - __hlist_del(&tw->tw_node); - sk_node_init(&tw->tw_node); - write_unlock(&ehead->lock); - - /* Disassociate with bind bucket. */ - bhead = &tcp_hashinfo.bhash[inet_bhashfn(tw->tw_num, tcp_hashinfo.bhash_size)]; - spin_lock(&bhead->lock); - tb = tw->tw_tb; - __hlist_del(&tw->tw_bind_node); - tw->tw_tb = NULL; - inet_bind_bucket_destroy(tcp_hashinfo.bind_bucket_cachep, tb); - spin_unlock(&bhead->lock); - -#ifdef SOCK_REFCNT_DEBUG - if (atomic_read(&tw->tw_refcnt) != 1) { - printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", - tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); - } -#endif - inet_twsk_put(tw); -} - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -290,40 +254,6 @@ kill: return TCP_TW_SUCCESS; } -/* Enter the time wait state. This is called with locally disabled BH. - * Essentially we whip up a timewait bucket, copy the - * relevant info into it from the SK, and mess with hash chains - * and list linkage. - */ -static void __tcp_tw_hashdance(struct sock *sk, struct inet_timewait_sock *tw) -{ - const struct inet_sock *inet = inet_sk(sk); - struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[sk->sk_hashent]; - struct inet_bind_hashbucket *bhead; - /* Step 1: Put TW into bind hash. Original socket stays there too. - Note, that any socket with inet->num != 0 MUST be bound in - binding cache, even if it is closed. - */ - bhead = &tcp_hashinfo.bhash[inet_bhashfn(inet->num, tcp_hashinfo.bhash_size)]; - spin_lock(&bhead->lock); - tw->tw_tb = inet->bind_hash; - BUG_TRAP(inet->bind_hash); - inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); - spin_unlock(&bhead->lock); - - write_lock(&ehead->lock); - - /* Step 2: Remove SK from established hash. */ - if (__sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - - /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ - inet_twsk_add_node(tw, &(ehead + tcp_hashinfo.ehash_size)->chain); - atomic_inc(&tw->tw_refcnt); - - write_unlock(&ehead->lock); -} - /* * Move a socket to time-wait or dead fin-wait-2 state. */ @@ -381,7 +311,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_ipv6only = 0; #endif /* Linkage updates. */ - __tcp_tw_hashdance(sk, tw); + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) @@ -448,7 +378,7 @@ rescan: inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { __inet_twsk_del_dead_node(tw); spin_unlock(&tw_death_lock); - tcp_timewait_kill(tw); + __inet_twsk_kill(tw, &tcp_hashinfo); inet_twsk_put(tw); killed++; spin_lock(&tw_death_lock); @@ -544,7 +474,7 @@ void tcp_tw_deschedule(struct inet_timewait_sock *tw) del_timer(&tcp_tw_timer); } spin_unlock(&tw_death_lock); - tcp_timewait_kill(tw); + __inet_twsk_kill(tw, &tcp_hashinfo); } /* Short-time timewait calendar */ @@ -653,7 +583,7 @@ void tcp_twcal_tick(unsigned long dummy) inet_twsk_for_each_inmate_safe(tw, node, safe, &tcp_twcal_row[slot]) { __inet_twsk_del_dead_node(tw); - tcp_timewait_kill(tw); + __inet_twsk_kill(tw, &tcp_hashinfo); inet_twsk_put(tw); killed++; } -- cgit v0.10.2 From c676270bcd25015b978722ec0352c330dcc87883 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:09:59 -0700 Subject: [INET_TWSK]: Introduce inet_twsk_alloc With the parts of tcp_time_wait that are not TCP specific, tcp_time_wait uses it and so will dccp_time_wait. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 020f280..e00861b 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -142,6 +142,9 @@ static inline void inet_twsk_put(struct inet_timewait_sock *tw) } } +extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, + const int state); + extern void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index d38d160..ceb577c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -81,3 +81,31 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, write_unlock(&ehead->lock); } + +struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) +{ + struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, + SLAB_ATOMIC); + if (tw != NULL) { + const struct inet_sock *inet = inet_sk(sk); + + /* Give us an identity. */ + tw->tw_daddr = inet->daddr; + tw->tw_rcv_saddr = inet->rcv_saddr; + tw->tw_bound_dev_if = sk->sk_bound_dev_if; + tw->tw_num = inet->num; + tw->tw_state = TCP_TIME_WAIT; + tw->tw_substate = state; + tw->tw_sport = inet->sport; + tw->tw_dport = inet->dport; + tw->tw_family = sk->sk_family; + tw->tw_reuse = sk->sk_reuse; + tw->tw_hashent = sk->sk_hashent; + tw->tw_ipv6only = 0; + tw->tw_prot = sk->sk_prot_creator; + atomic_set(&tw->tw_refcnt, 1); + inet_twsk_dead_node_init(tw); + } + + return tw; +} diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4112f7a..66ce179 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -267,37 +267,18 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) recycle_ok = tp->af_specific->remember_stamp(sk); if (tcp_tw_count < sysctl_tcp_max_tw_buckets) - tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, SLAB_ATOMIC); + tw = inet_twsk_alloc(sk, state); if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - const struct inet_sock *inet = inet_sk(sk); const int rto = (tp->rto << 2) - (tp->rto >> 1); - /* Remember our protocol */ - tw->tw_prot = sk->sk_prot_creator; - - /* Give us an identity. */ - tw->tw_daddr = inet->daddr; - tw->tw_rcv_saddr = inet->rcv_saddr; - tw->tw_bound_dev_if = sk->sk_bound_dev_if; - tw->tw_num = inet->num; - tw->tw_state = TCP_TIME_WAIT; - tw->tw_substate = state; - tw->tw_sport = inet->sport; - tw->tw_dport = inet->dport; - tw->tw_family = sk->sk_family; - tw->tw_reuse = sk->sk_reuse; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; - atomic_set(&tw->tw_refcnt, 1); - - tw->tw_hashent = sk->sk_hashent; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; - inet_twsk_dead_node_init(tw); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { @@ -307,8 +288,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); tw->tw_ipv6only = np->ipv6only; - } else - tw->tw_ipv6only = 0; + } #endif /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); -- cgit v0.10.2 From 87d11ceb9deb7a3f13fdee6e89d9bb6be7d27a71 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:10:12 -0700 Subject: [SOCK]: Introduce sk_clone Out of tcp_create_openreq_child, will be used in dccp_create_openreq_child, and is a nice sock function anyway. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/sock.h b/include/net/sock.h index bdae0a5..828dc08 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -737,6 +737,8 @@ extern struct sock *sk_alloc(int family, unsigned int __nocast priority, struct proto *prot, int zero_it); extern void sk_free(struct sock *sk); +extern struct sock *sk_clone(const struct sock *sk, + const unsigned int __nocast priority); extern struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, diff --git a/net/core/sock.c b/net/core/sock.c index aba31fe..ccd10fd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -700,6 +700,80 @@ void sk_free(struct sock *sk) module_put(owner); } +struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority) +{ + struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0); + + if (newsk != NULL) { + struct sk_filter *filter; + + memcpy(newsk, sk, sk->sk_prot->obj_size); + + /* SANITY */ + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); + + atomic_set(&newsk->sk_rmem_alloc, 0); + atomic_set(&newsk->sk_wmem_alloc, 0); + atomic_set(&newsk->sk_omem_alloc, 0); + skb_queue_head_init(&newsk->sk_receive_queue); + skb_queue_head_init(&newsk->sk_write_queue); + + rwlock_init(&newsk->sk_dst_lock); + rwlock_init(&newsk->sk_callback_lock); + + newsk->sk_dst_cache = NULL; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + newsk->sk_send_head = NULL; + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + + sock_reset_flag(newsk, SOCK_DONE); + skb_queue_head_init(&newsk->sk_error_queue); + + filter = newsk->sk_filter; + if (filter != NULL) + sk_filter_charge(newsk, filter); + + if (unlikely(xfrm_sk_clone_policy(newsk))) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + newsk = NULL; + goto out; + } + + newsk->sk_err = 0; + newsk->sk_priority = 0; + atomic_set(&newsk->sk_refcnt, 2); + + /* + * Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy). + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. -acme + */ + sk_refcnt_debug_inc(newsk); + newsk->sk_socket = NULL; + newsk->sk_sleep = NULL; + + if (newsk->sk_prot->sockets_allocated) + atomic_inc(newsk->sk_prot->sockets_allocated); + } +out: + return newsk; +} + +EXPORT_SYMBOL_GPL(sk_clone); + void __init sk_init(void) { if (num_physpages <= 4096) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 66ce179..8b6cd8d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -599,67 +599,26 @@ out: */ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) { - /* allocate the newsk from the same slab of the master sock, - * if not, at sk_free time we'll try to free it from the wrong - * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */ - struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); + struct sock *newsk = sk_clone(sk, GFP_ATOMIC); - if(newsk != NULL) { + if (newsk != NULL) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_sock *newinet = inet_sk(newsk); struct tcp_sock *newtp; - struct sk_filter *filter; - memcpy(newsk, sk, sizeof(struct tcp_sock)); newsk->sk_state = TCP_SYN_RECV; - - /* SANITY */ - sk_node_init(&newsk->sk_node); newinet->bind_hash = NULL; /* Clone the TCP header template */ newinet->dport = ireq->rmt_port; - - sock_lock_init(newsk); - bh_lock_sock(newsk); - - rwlock_init(&newsk->sk_dst_lock); - newsk->sk_dst_cache = NULL; - atomic_set(&newsk->sk_rmem_alloc, 0); - skb_queue_head_init(&newsk->sk_receive_queue); - atomic_set(&newsk->sk_wmem_alloc, 0); - skb_queue_head_init(&newsk->sk_write_queue); - atomic_set(&newsk->sk_omem_alloc, 0); - newsk->sk_wmem_queued = 0; - newsk->sk_forward_alloc = 0; - - sock_reset_flag(newsk, SOCK_DONE); - newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; - newsk->sk_send_head = NULL; - rwlock_init(&newsk->sk_callback_lock); - skb_queue_head_init(&newsk->sk_error_queue); newsk->sk_write_space = sk_stream_write_space; - if ((filter = newsk->sk_filter) != NULL) - sk_filter_charge(newsk, filter); - - if (unlikely(xfrm_sk_clone_policy(newsk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - return NULL; - } - /* Now setup tcp_sock */ newtp = tcp_sk(newsk); newtp->pred_flags = 0; newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->snd_nxt = treq->snt_isn + 1; - newtp->snd_una = treq->snt_isn + 1; - newtp->snd_sml = treq->snt_isn + 1; + newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1; tcp_prequeue_init(newtp); @@ -710,32 +669,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue)); - /* Back to base struct sock members. */ - newsk->sk_err = 0; - newsk->sk_priority = 0; - atomic_set(&newsk->sk_refcnt, 2); - - /* - * Increment the counter in the same struct proto as the master - * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that - * is the same as sk->sk_prot->socks, as this field was copied - * with memcpy), same rationale as the first comment in this - * function. - * - * This _changes_ the previous behaviour, where - * tcp_create_openreq_child always was incrementing the - * equivalent to tcp_prot->socks (inet_sock_nr), so this have - * to be taken into account in all callers. -acme - */ - sk_refcnt_debug_inc(newsk); - - atomic_inc(&tcp_sockets_allocated); - if (sock_flag(newsk, SOCK_KEEPOPEN)) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); - newsk->sk_socket = NULL; - newsk->sk_sleep = NULL; newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { -- cgit v0.10.2 From 463c84b97f24010a67cd871746d6a7e4c925a5f9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:10:42 -0700 Subject: [NET]: Introduce inet_connection_sock This creates struct inet_connection_sock, moving members out of struct tcp_sock that are shareable with other INET connection oriented protocols, such as DCCP, that in my private tree already uses most of these members. The functions that operate on these members were renamed, using a inet_csk_ prefix while not being moved yet to a new file, so as to ease the review of these changes. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/ip.h b/include/linux/ip.h index 2c54bbd..33e8a19 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -128,7 +128,6 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) return (struct inet_request_sock *)sk; } -struct inet_bind_bucket; struct ipv6_pinfo; struct inet_sock { @@ -158,7 +157,6 @@ struct inet_sock { int mc_index; /* Multicast device index */ __u32 mc_addr; struct ip_mc_socklist *mc_list; /* Group array */ - struct inet_bind_bucket *bind_hash; /* * Following members are used to retain the infomation to build * an ip header on each ip fragmentation while the socket is corked. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 98fa323..8859191 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -333,15 +333,15 @@ static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; } -static inline int tcp_twsk_ipv6only(const struct sock *sk) +static inline int inet_twsk_ipv6only(const struct sock *sk) { return inet_twsk(sk)->tw_ipv6only; } -static inline int tcp_v6_ipv6only(const struct sock *sk) +static inline int inet_v6_ipv6only(const struct sock *sk) { return likely(sk->sk_state != TCP_TIME_WAIT) ? - ipv6_only_sock(sk) : tcp_twsk_ipv6only(sk); + ipv6_only_sock(sk) : inet_twsk_ipv6only(sk); } #else #define __ipv6_only_sock(sk) 0 @@ -360,7 +360,7 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk) #define __tcp_v6_rcv_saddr(__sk) NULL #define tcp_v6_rcv_saddr(__sk) NULL #define tcp_twsk_ipv6only(__sk) 0 -#define tcp_v6_ipv6only(__sk) 0 +#define inet_v6_ipv6only(__sk) 0 #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ #define INET6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 5d295b1..800930f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -177,8 +177,8 @@ struct tcp_info #include #include -#include #include +#include #include /* This defines a selective acknowledgement block. */ @@ -219,8 +219,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) } struct tcp_sock { - /* inet_sock has to be the first member of tcp_sock */ - struct inet_sock inet; + /* inet_connection_sock has to be the first member of tcp_sock */ + struct inet_connection_sock inet_conn; int tcp_header_len; /* Bytes of tcp header to send */ /* @@ -241,18 +241,6 @@ struct tcp_sock { __u32 snd_sml; /* Last byte of the most recently transmitted small packet */ __u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ __u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ - /* Delayed ACK control data */ - struct { - __u8 pending; /* ACK is pending */ - __u8 quick; /* Scheduled number of quick acks */ - __u8 pingpong; /* The session is interactive */ - __u8 blocked; /* Delayed ACK was blocked by socket lock*/ - __u32 ato; /* Predicted tick of soft clock */ - unsigned long timeout; /* Currently scheduled timeout */ - __u32 lrcvtime; /* timestamp of last received data packet*/ - __u16 last_seg_size; /* Size of last incoming segment */ - __u16 rcv_mss; /* MSS used for delayed ACK decisions */ - } ack; /* Data for direct copy to user */ struct { @@ -271,8 +259,8 @@ struct tcp_sock { __u16 xmit_size_goal; /* Goal for segmenting output packets */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ __u8 ca_state; /* State of fast-retransmit machine */ - __u8 retransmits; /* Number of unrecovered RTO timeouts. */ + __u8 keepalive_probes; /* num of allowed keep alive probes */ __u16 advmss; /* Advertised MSS */ __u32 window_clamp; /* Maximal window to advertise */ __u32 rcv_ssthresh; /* Current window clamp */ @@ -281,7 +269,7 @@ struct tcp_sock { __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 unused; + __u8 nonagle; /* Disable Nagle algorithm? */ __u8 defer_accept; /* User waits for some data after accept() */ /* RTT measurement */ @@ -290,19 +278,13 @@ struct tcp_sock { __u32 mdev_max; /* maximal mdev for the last rtt period */ __u32 rttvar; /* smoothed mdev_max */ __u32 rtt_seq; /* sequence number to update rttvar */ - __u32 rto; /* retransmit timeout */ __u32 packets_out; /* Packets which are "in flight" */ __u32 left_out; /* Packets which leaved network */ __u32 retrans_out; /* Retransmitted packets out */ - __u8 backoff; /* backoff */ /* * Options received (usually on last packet, some only on SYN packets). */ - __u8 nonagle; /* Disable Nagle algorithm? */ - __u8 keepalive_probes; /* num of allowed keep alive probes */ - - __u8 probes_out; /* unanswered 0 window probes */ struct tcp_options_received rx_opt; /* @@ -315,11 +297,6 @@ struct tcp_sock { __u32 snd_cwnd_used; __u32 snd_cwnd_stamp; - /* Two commonly used timers in both sender and receiver paths. */ - unsigned long timeout; - struct timer_list retransmit_timer; /* Resend (no ack) */ - struct timer_list delack_timer; /* Ack delay */ - struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ @@ -334,7 +311,7 @@ struct tcp_sock { struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ - __u8 syn_retries; /* num of allowed syn retries */ + __u8 probes_out; /* unanswered 0 window probes */ __u8 ecn_flags; /* ECN status bits. */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ __u32 lost_out; /* Lost packets */ @@ -349,14 +326,12 @@ struct tcp_sock { int undo_retrans; /* number of undoable retransmissions. */ __u32 urg_seq; /* Seq of received urgent pointer */ __u16 urg_data; /* Saved octet of OOB data and control flags */ - __u8 pending; /* Scheduled timer event */ __u8 urg_mode; /* In urgent mode */ + /* ONE BYTE HOLE, TRY TO PACK! */ __u32 snd_up; /* Urgent pointer */ __u32 total_retrans; /* Total retransmits for entire connection */ - struct request_sock_queue accept_queue; /* FIFO of established children */ - unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h new file mode 100644 index 0000000..ef60939 --- /dev/null +++ b/include/net/inet_connection_sock.h @@ -0,0 +1,86 @@ +/* + * NET Generic infrastructure for INET connection oriented protocols. + * + * Definitions for inet_connection_sock + * + * Authors: Many people, see the TCP sources + * + * From code originally in TCP + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET_CONNECTION_SOCK_H +#define _INET_CONNECTION_SOCK_H + +#include +#include +#include + +struct inet_bind_bucket; +struct inet_hashinfo; + +/** inet_connection_sock - INET connection oriented sock + * + * @icsk_accept_queue: FIFO of established children + * @icsk_bind_hash: Bind node + * @icsk_timeout: Timeout + * @icsk_retransmit_timer: Resend (no ack) + * @icsk_rto: Retransmit timeout + * @icsk_retransmits: Number of unrecovered [RTO] timeouts + * @icsk_pending: Scheduled timer event + * @icsk_backoff: Backoff + * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries + * @icsk_ack: Delayed ACK control data + */ +struct inet_connection_sock { + /* inet_sock has to be the first member! */ + struct inet_sock icsk_inet; + struct request_sock_queue icsk_accept_queue; + struct inet_bind_bucket *icsk_bind_hash; + unsigned long icsk_timeout; + struct timer_list icsk_retransmit_timer; + struct timer_list icsk_delack_timer; + __u32 icsk_rto; + __u8 icsk_retransmits; + __u8 icsk_pending; + __u8 icsk_backoff; + __u8 icsk_syn_retries; + struct { + __u8 pending; /* ACK is pending */ + __u8 quick; /* Scheduled number of quick acks */ + __u8 pingpong; /* The session is interactive */ + __u8 blocked; /* Delayed ACK was blocked by socket lock */ + __u32 ato; /* Predicted tick of soft clock */ + unsigned long timeout; /* Currently scheduled timeout */ + __u32 lrcvtime; /* timestamp of last received data packet */ + __u16 last_seg_size; /* Size of last incoming segment */ + __u16 rcv_mss; /* MSS used for delayed ACK decisions */ + } icsk_ack; +}; + +static inline struct inet_connection_sock *inet_csk(const struct sock *sk) +{ + return (struct inet_connection_sock *)sk; +} + +extern void inet_csk_init_xmit_timers(struct sock *sk, + void (*retransmit_handler)(unsigned long), + void (*delack_handler)(unsigned long), + void (*keepalive_handler)(unsigned long)); +extern void inet_csk_clear_xmit_timers(struct sock *sk); + +extern struct request_sock *inet_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const __u32 raddr, + const __u32 laddr); +extern int inet_csk_get_port(struct inet_hashinfo *hashinfo, + struct sock *sk, unsigned short snum); + +extern struct dst_entry* inet_csk_route_req(struct sock *sk, + const struct request_sock *req); + +#endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index b5c0d64..f0c21c0 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -26,6 +25,7 @@ #include #include +#include #include #include @@ -185,9 +185,9 @@ static inline void __inet_inherit_port(struct inet_hashinfo *table, struct inet_bind_bucket *tb; spin_lock(&head->lock); - tb = inet_sk(sk)->bind_hash; + tb = inet_csk(sk)->icsk_bind_hash; sk_add_bind_node(child, &tb->owners); - inet_sk(child)->bind_hash = tb; + inet_csk(child)->icsk_bind_hash = tb; spin_unlock(&head->lock); } diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 334717b..b7c7eec 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -224,17 +224,17 @@ static inline int reqsk_queue_added(struct request_sock_queue *queue) return prev_qlen; } -static inline int reqsk_queue_len(struct request_sock_queue *queue) +static inline int reqsk_queue_len(const struct request_sock_queue *queue) { return queue->listen_opt != NULL ? queue->listen_opt->qlen : 0; } -static inline int reqsk_queue_len_young(struct request_sock_queue *queue) +static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) { return queue->listen_opt->qlen_young; } -static inline int reqsk_queue_is_full(struct request_sock_queue *queue) +static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) { return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log; } diff --git a/include/net/sock.h b/include/net/sock.h index 828dc08..48cc337 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -493,9 +493,6 @@ extern int sk_wait_data(struct sock *sk, long *timeo); struct request_sock_ops; -/* Here is the right place to enable sock refcounting debugging */ -//#define SOCK_REFCNT_DEBUG - /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface * transport -> network interface is defined by struct inet_proto diff --git a/include/net/tcp.h b/include/net/tcp.h index cf8e664..a943c79 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -19,10 +19,11 @@ #define _TCP_H #define TCP_DEBUG 1 +#define INET_CSK_DEBUG 1 #define FASTRETRANS_DEBUG 1 /* Cancel timers, when they are not required. */ -#undef TCP_CLEAR_TIMERS +#undef INET_CSK_CLEAR_TIMERS #include #include @@ -205,10 +206,10 @@ extern void tcp_tw_deschedule(struct inet_timewait_sock *tw); #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 -#define TCP_TIME_RETRANS 1 /* Retransmit timer */ -#define TCP_TIME_DACK 2 /* Delayed ack timer */ -#define TCP_TIME_PROBE0 3 /* Zero window probe timer */ -#define TCP_TIME_KEEPOPEN 4 /* Keepalive timer */ +#define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +#define ICSK_TIME_DACK 2 /* Delayed ack timer */ +#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ +#define ICSK_TIME_KEEPOPEN 4 /* Keepalive timer */ /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -257,9 +258,9 @@ extern atomic_t tcp_sockets_allocated; extern int tcp_memory_pressure; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -#define TCP_INET_FAMILY(fam) ((fam) == AF_INET) +#define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else -#define TCP_INET_FAMILY(fam) 1 +#define AF_INET_FAMILY(fam) 1 #endif /* @@ -372,41 +373,42 @@ extern int tcp_rcv_established(struct sock *sk, extern void tcp_rcv_space_adjust(struct sock *sk); -enum tcp_ack_state_t -{ - TCP_ACK_SCHED = 1, - TCP_ACK_TIMER = 2, - TCP_ACK_PUSHED= 4 +enum inet_csk_ack_state_t { + ICSK_ACK_SCHED = 1, + ICSK_ACK_TIMER = 2, + ICSK_ACK_PUSHED = 4 }; -static inline void tcp_schedule_ack(struct tcp_sock *tp) +static inline void inet_csk_schedule_ack(struct sock *sk) { - tp->ack.pending |= TCP_ACK_SCHED; + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED; } -static inline int tcp_ack_scheduled(struct tcp_sock *tp) +static inline int inet_csk_ack_scheduled(const struct sock *sk) { - return tp->ack.pending&TCP_ACK_SCHED; + return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED; } -static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts) +static inline void tcp_dec_quickack_mode(struct sock *sk, + const unsigned int pkts) { - if (tp->ack.quick) { - if (pkts >= tp->ack.quick) { - tp->ack.quick = 0; + struct inet_connection_sock *icsk = inet_csk(sk); + if (icsk->icsk_ack.quick) { + if (pkts >= icsk->icsk_ack.quick) { + icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ - tp->ack.ato = TCP_ATO_MIN; + icsk->icsk_ack.ato = TCP_ATO_MIN; } else - tp->ack.quick -= pkts; + icsk->icsk_ack.quick -= pkts; } } -extern void tcp_enter_quickack_mode(struct tcp_sock *tp); +extern void tcp_enter_quickack_mode(struct sock *sk); -static __inline__ void tcp_delack_init(struct tcp_sock *tp) +static inline void inet_csk_delack_init(struct sock *sk) { - memset(&tp->ack, 0, sizeof(tp->ack)); + memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); } static inline void tcp_clear_options(struct tcp_options_received *rx_opt) @@ -440,7 +442,7 @@ extern void tcp_update_metrics(struct sock *sk); extern void tcp_close(struct sock *sk, long timeout); -extern struct sock * tcp_accept(struct sock *sk, int flags, int *err); +extern struct sock * inet_csk_accept(struct sock *sk, int flags, int *err); extern unsigned int tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait); extern int tcp_getsockopt(struct sock *sk, int level, @@ -534,15 +536,18 @@ extern void tcp_cwnd_application_limited(struct sock *sk); /* tcp_timer.c */ extern void tcp_init_xmit_timers(struct sock *); -extern void tcp_clear_xmit_timers(struct sock *); +static inline void tcp_clear_xmit_timers(struct sock *sk) +{ + inet_csk_clear_xmit_timers(sk); +} -extern void tcp_delete_keepalive_timer(struct sock *); -extern void tcp_reset_keepalive_timer(struct sock *, unsigned long); +extern void inet_csk_delete_keepalive_timer(struct sock *sk); +extern void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); extern unsigned int tcp_current_mss(struct sock *sk, int large); -#ifdef TCP_DEBUG -extern const char tcp_timer_bug_msg[]; +#ifdef INET_CSK_DEBUG +extern const char inet_csk_timer_bug_msg[]; #endif /* tcp_diag.c */ @@ -554,70 +559,58 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -static inline void tcp_clear_xmit_timer(struct sock *sk, int what) +static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); - switch (what) { - case TCP_TIME_RETRANS: - case TCP_TIME_PROBE0: - tp->pending = 0; - -#ifdef TCP_CLEAR_TIMERS - sk_stop_timer(sk, &tp->retransmit_timer); + if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { + icsk->icsk_pending = 0; +#ifdef INET_CSK_CLEAR_TIMERS + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif - break; - case TCP_TIME_DACK: - tp->ack.blocked = 0; - tp->ack.pending = 0; - -#ifdef TCP_CLEAR_TIMERS - sk_stop_timer(sk, &tp->delack_timer); + } else if (what == ICSK_TIME_DACK) { + icsk->icsk_ack.blocked = icsk->icsk_ack.pending = 0; +#ifdef INET_CSK_CLEAR_TIMERS + sk_stop_timer(sk, &icsk->icsk_delack_timer); #endif - break; - default: -#ifdef TCP_DEBUG - printk(tcp_timer_bug_msg); + } +#ifdef INET_CSK_DEBUG + else { + pr_debug(inet_csk_timer_bug_msg); + } #endif - return; - }; - } /* * Reset the retransmission timer */ -static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) +static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, + unsigned long when) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (when > TCP_RTO_MAX) { -#ifdef TCP_DEBUG - printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr()); +#ifdef INET_CSK_DEBUG + pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", + sk, what, when, current_text_addr()); #endif when = TCP_RTO_MAX; } - switch (what) { - case TCP_TIME_RETRANS: - case TCP_TIME_PROBE0: - tp->pending = what; - tp->timeout = jiffies+when; - sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); - break; - - case TCP_TIME_DACK: - tp->ack.pending |= TCP_ACK_TIMER; - tp->ack.timeout = jiffies+when; - sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); - break; - - default: -#ifdef TCP_DEBUG - printk(tcp_timer_bug_msg); + if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { + icsk->icsk_pending = what; + icsk->icsk_timeout = jiffies + when; + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + } else if (what == ICSK_TIME_DACK) { + icsk->icsk_ack.pending |= ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = jiffies + when; + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + } +#ifdef INET_CSK_DEBUG + else { + pr_debug(inet_csk_timer_bug_msg); + } #endif - return; - }; } /* Initialize RCV_MSS value. @@ -637,7 +630,7 @@ static inline void tcp_initialize_rcv_mss(struct sock *sk) hint = min(hint, TCP_MIN_RCVMSS); hint = max(hint, TCP_MIN_MSS); - tp->ack.rcv_mss = hint; + inet_csk(sk)->icsk_ack.rcv_mss = hint; } static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) @@ -772,7 +765,7 @@ static inline void tcp_packets_out_inc(struct sock *sk, tp->packets_out += tcp_skb_pcount(skb); if (!orig) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); } static inline void tcp_packets_out_dec(struct tcp_sock *tp, @@ -939,8 +932,9 @@ static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) { - if (!tp->packets_out && !tp->pending) - tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); + const struct inet_connection_sock *icsk = inet_csk(sk); + if (!tp->packets_out && !icsk->icsk_pending) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, icsk->icsk_rto); } static __inline__ void tcp_push_pending_frames(struct sock *sk, @@ -1021,8 +1015,9 @@ static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb) tp->ucopy.memory = 0; } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { wake_up_interruptible(sk->sk_sleep); - if (!tcp_ack_scheduled(tp)) - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4); + if (!inet_csk_ack_scheduled(sk)) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + (3 * TCP_RTO_MIN) / 4); } return 1; } @@ -1055,7 +1050,7 @@ static __inline__ void tcp_set_state(struct sock *sk, int state) TCP_INC_STATS(TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); - if (inet_sk(sk)->bind_hash && + if (inet_csk(sk)->icsk_bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(&tcp_hashinfo, sk); /* fall through */ @@ -1186,51 +1181,55 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk->sk_rcvbuf); } -static inline void tcp_acceptq_queue(struct sock *sk, struct request_sock *req, - struct sock *child) +static inline void inet_csk_reqsk_queue_add(struct sock *sk, + struct request_sock *req, + struct sock *child) { - reqsk_queue_add(&tcp_sk(sk)->accept_queue, req, sk, child); + reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child); } -static inline void -tcp_synq_removed(struct sock *sk, struct request_sock *req) +static inline void inet_csk_reqsk_queue_removed(struct sock *sk, + struct request_sock *req) { - if (reqsk_queue_removed(&tcp_sk(sk)->accept_queue, req) == 0) - tcp_delete_keepalive_timer(sk); + if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0) + inet_csk_delete_keepalive_timer(sk); } -static inline void tcp_synq_added(struct sock *sk) +static inline void inet_csk_reqsk_queue_added(struct sock *sk, + const unsigned long timeout) { - if (reqsk_queue_added(&tcp_sk(sk)->accept_queue) == 0) - tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT); + if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0) + inet_csk_reset_keepalive_timer(sk, timeout); } -static inline int tcp_synq_len(struct sock *sk) +static inline int inet_csk_reqsk_queue_len(const struct sock *sk) { - return reqsk_queue_len(&tcp_sk(sk)->accept_queue); + return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue); } -static inline int tcp_synq_young(struct sock *sk) +static inline int inet_csk_reqsk_queue_young(const struct sock *sk) { - return reqsk_queue_len_young(&tcp_sk(sk)->accept_queue); + return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue); } -static inline int tcp_synq_is_full(struct sock *sk) +static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return reqsk_queue_is_full(&tcp_sk(sk)->accept_queue); + return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue); } -static inline void tcp_synq_unlink(struct tcp_sock *tp, struct request_sock *req, - struct request_sock **prev) +static inline void inet_csk_reqsk_queue_unlink(struct sock *sk, + struct request_sock *req, + struct request_sock **prev) { - reqsk_queue_unlink(&tp->accept_queue, req, prev); + reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req, prev); } -static inline void tcp_synq_drop(struct sock *sk, struct request_sock *req, - struct request_sock **prev) +static inline void inet_csk_reqsk_queue_drop(struct sock *sk, + struct request_sock *req, + struct request_sock **prev) { - tcp_synq_unlink(tcp_sk(sk), req, prev); - tcp_synq_removed(sk, req); + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); reqsk_free(req); } @@ -1265,12 +1264,13 @@ static inline int keepalive_time_when(const struct tcp_sock *tp) return tp->keepalive_time ? : sysctl_tcp_keepalive_time; } -static inline int tcp_fin_time(const struct tcp_sock *tp) +static inline int tcp_fin_time(const struct sock *sk) { - int fin_timeout = tp->linger2 ? : sysctl_tcp_fin_timeout; + int fin_timeout = tcp_sk(sk)->linger2 ? : sysctl_tcp_fin_timeout; + const int rto = inet_csk(sk)->icsk_rto; - if (fin_timeout < (tp->rto<<2) - (tp->rto>>1)) - fin_timeout = (tp->rto<<2) - (tp->rto>>1); + if (fin_timeout < (rto << 2) - (rto >> 1)) + fin_timeout = (rto << 2) - (rto >> 1); return fin_timeout; } diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 64980ee..c6b8439 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -88,7 +88,7 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) * it is surely retransmit. It is not in ECN RFC, * but Linux follows this rule. */ else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode((struct sock *)tp); } } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d94e962..e8d29fe 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,6 +19,7 @@ #include #include +#include #include /* @@ -56,10 +57,9 @@ void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { - struct inet_sock *inet = inet_sk(sk); - inet->num = snum; + inet_sk(sk)->num = snum; sk_add_bind_node(sk, &tb->owners); - inet->bind_hash = tb; + inet_csk(sk)->icsk_bind_hash = tb; } EXPORT_SYMBOL(inet_bind_hash); @@ -69,16 +69,15 @@ EXPORT_SYMBOL(inet_bind_hash); */ static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); - const int bhash = inet_bhashfn(inet->num, hashinfo->bhash_size); + const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; spin_lock(&head->lock); - tb = inet->bind_hash; + tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); - inet->bind_hash = NULL; - inet->num = 0; + inet_csk(sk)->icsk_bind_hash = NULL; + inet_sk(sk)->num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ceb577c..5cba59b 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -56,6 +56,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo) { const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent]; struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. @@ -64,8 +65,8 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, */ bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; spin_lock(&bhead->lock); - tw->tw_tb = inet->bind_hash; - BUG_TRAP(inet->bind_hash); + tw->tw_tb = icsk->icsk_bind_hash; + BUG_TRAP(icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 72d0144..8692cb9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -180,7 +180,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); if (child) - tcp_acceptq_queue(sk, req, child); + inet_csk_reqsk_queue_add(sk, req, child); else reqsk_free(req); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1a708b..8177b86 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -313,7 +313,7 @@ EXPORT_SYMBOL(tcp_enter_memory_pressure); static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) { - return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0; + return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (POLLIN | POLLRDNORM) : 0; } /* @@ -458,15 +458,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) int tcp_listen_start(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE); + struct inet_connection_sock *icsk = inet_csk(sk); + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, TCP_SYNQ_HSIZE); if (rc != 0) return rc; sk->sk_max_ack_backlog = 0; sk->sk_ack_backlog = 0; - tcp_delack_init(tp); + inet_csk_delack_init(sk); /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). @@ -484,7 +484,7 @@ int tcp_listen_start(struct sock *sk) } sk->sk_state = TCP_CLOSE; - __reqsk_queue_destroy(&tp->accept_queue); + __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } @@ -495,14 +495,14 @@ int tcp_listen_start(struct sock *sk) static void tcp_listen_stop (struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *acc_req; struct request_sock *req; - tcp_delete_keepalive_timer(sk); + inet_csk_delete_keepalive_timer(sk); /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue); + acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) @@ -512,7 +512,7 @@ static void tcp_listen_stop (struct sock *sk) * To be honest, we are not able to make either * of the variants now. --ANK */ - reqsk_queue_destroy(&tp->accept_queue); + reqsk_queue_destroy(&icsk->icsk_accept_queue); while ((req = acc_req) != NULL) { struct sock *child = req->sk; @@ -1039,20 +1039,21 @@ static void cleanup_rbuf(struct sock *sk, int copied) BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); #endif - if (tcp_ack_scheduled(tp)) { + if (inet_csk_ack_scheduled(sk)) { + const struct inet_connection_sock *icsk = inet_csk(sk); /* Delayed ACKs frequently hit locked sockets during bulk * receive. */ - if (tp->ack.blocked || + if (icsk->icsk_ack.blocked || /* Once-per-two-segments ACK was not sent by tcp_input.c */ - tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || + tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if * connection is not bidirectional, user drained * receive buffer and there was a small segment * in queue. */ - (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && - !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) + (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && + !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = 1; } @@ -1569,7 +1570,7 @@ void tcp_destroy_sock(struct sock *sk) BUG_TRAP(sk_unhashed(sk)); /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || inet_sk(sk)->bind_hash); + BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); @@ -1698,10 +1699,10 @@ adjudge_to_death: tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); } else { - int tmo = tcp_fin_time(tp); + const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); + inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); } else { atomic_inc(&tcp_orphan_count); tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); @@ -1746,6 +1747,7 @@ static inline int tcp_need_reset(int state) int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int err = 0; int old_state = sk->sk_state; @@ -1782,7 +1784,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->srtt = 0; if ((tp->write_seq += tp->max_window + 2) == 0) tp->write_seq = 1; - tp->backoff = 0; + icsk->icsk_backoff = 0; tp->snd_cwnd = 2; tp->probes_out = 0; tp->packets_out = 0; @@ -1790,13 +1792,13 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_cwnd_cnt = 0; tcp_set_ca_state(tp, TCP_CA_Open); tcp_clear_retrans(tp); - tcp_delack_init(tp); + inet_csk_delack_init(sk); sk->sk_send_head = NULL; tp->rx_opt.saw_tstamp = 0; tcp_sack_reset(&tp->rx_opt); __sk_dst_reset(sk); - BUG_TRAP(!inet->num || inet->bind_hash); + BUG_TRAP(!inet->num || icsk->icsk_bind_hash); sk->sk_error_report(sk); return err; @@ -1808,7 +1810,7 @@ int tcp_disconnect(struct sock *sk, int flags) */ static int wait_for_connect(struct sock *sk, long timeo) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); DEFINE_WAIT(wait); int err; @@ -1830,11 +1832,11 @@ static int wait_for_connect(struct sock *sk, long timeo) prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); release_sock(sk); - if (reqsk_queue_empty(&tp->accept_queue)) + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); lock_sock(sk); err = 0; - if (!reqsk_queue_empty(&tp->accept_queue)) + if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) break; err = -EINVAL; if (sk->sk_state != TCP_LISTEN) @@ -1854,9 +1856,9 @@ static int wait_for_connect(struct sock *sk, long timeo) * This will accept the next outstanding connection. */ -struct sock *tcp_accept(struct sock *sk, int flags, int *err) +struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct sock *newsk; int error; @@ -1870,7 +1872,7 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) goto out_err; /* Find already established connection */ - if (reqsk_queue_empty(&tp->accept_queue)) { + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ @@ -1883,7 +1885,7 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) goto out_err; } - newsk = reqsk_queue_get_child(&tp->accept_queue, sk); + newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); out: release_sock(sk); @@ -1901,6 +1903,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int val; int err = 0; @@ -1999,7 +2002,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, elapsed = tp->keepalive_time - elapsed; else elapsed = 0; - tcp_reset_keepalive_timer(sk, elapsed); + inet_csk_reset_keepalive_timer(sk, elapsed); } } break; @@ -2019,7 +2022,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, if (val < 1 || val > MAX_TCP_SYNCNT) err = -EINVAL; else - tp->syn_retries = val; + icsk->icsk_syn_retries = val; break; case TCP_LINGER2: @@ -2058,16 +2061,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, case TCP_QUICKACK: if (!val) { - tp->ack.pingpong = 1; + icsk->icsk_ack.pingpong = 1; } else { - tp->ack.pingpong = 0; + icsk->icsk_ack.pingpong = 0; if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - tcp_ack_scheduled(tp)) { - tp->ack.pending |= TCP_ACK_PUSHED; + inet_csk_ack_scheduled(sk)) { + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; cleanup_rbuf(sk, 1); if (!(val & 1)) - tp->ack.pingpong = 1; + icsk->icsk_ack.pingpong = 1; } } break; @@ -2084,15 +2087,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, void tcp_get_info(struct sock *sk, struct tcp_info *info) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; memset(info, 0, sizeof(*info)); info->tcpi_state = sk->sk_state; info->tcpi_ca_state = tp->ca_state; - info->tcpi_retransmits = tp->retransmits; + info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = tp->probes_out; - info->tcpi_backoff = tp->backoff; + info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; @@ -2107,10 +2111,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->ecn_flags&TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; - info->tcpi_rto = jiffies_to_usecs(tp->rto); - info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); + info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); + info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); info->tcpi_snd_mss = tp->mss_cache; - info->tcpi_rcv_mss = tp->ack.rcv_mss; + info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_unacked = tp->packets_out; info->tcpi_sacked = tp->sacked_out; @@ -2119,7 +2123,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_fackets = tp->fackets_out; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); - info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); + info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = tp->pmtu_cookie; @@ -2179,7 +2183,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; break; case TCP_SYNCNT: - val = tp->syn_retries ? : sysctl_tcp_syn_retries; + val = inet_csk(sk)->icsk_syn_retries ? : sysctl_tcp_syn_retries; break; case TCP_LINGER2: val = tp->linger2; @@ -2209,7 +2213,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, return 0; } case TCP_QUICKACK: - val = !tp->ack.pingpong; + val = !inet_csk(sk)->icsk_ack.pingpong; break; case TCP_CONGESTION: @@ -2340,7 +2344,7 @@ void __init tcp_init(void) tcp_register_congestion_control(&tcp_reno); } -EXPORT_SYMBOL(tcp_accept); +EXPORT_SYMBOL(inet_csk_accept); EXPORT_SYMBOL(tcp_close); EXPORT_SYMBOL(tcp_destroy_sock); EXPORT_SYMBOL(tcp_disconnect); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 60c6a79..5f4c74f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -48,8 +48,9 @@ static struct sock *tcpnl; static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags) { - struct inet_sock *inet = inet_sk(sk); + const struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcpdiagmsg *r; struct nlmsghdr *nlh; struct tcp_info *info = NULL; @@ -129,14 +130,14 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, #define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ - if (tp->pending == TCP_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { r->tcpdiag_timer = 1; - r->tcpdiag_retrans = tp->retransmits; - r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); - } else if (tp->pending == TCP_TIME_PROBE0) { + r->tcpdiag_retrans = icsk->icsk_retransmits; + r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->tcpdiag_timer = 4; r->tcpdiag_retrans = tp->probes_out; - r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); + r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (timer_pending(&sk->sk_timer)) { r->tcpdiag_timer = 2; r->tcpdiag_retrans = tp->probes_out; @@ -497,7 +498,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, { struct tcpdiag_entry entry; struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt; struct rtattr *bc = NULL; struct inet_sock *inet = inet_sk(sk); @@ -513,9 +514,9 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.family = sk->sk_family; - read_lock_bh(&tp->accept_queue.syn_wait_lock); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - lopt = tp->accept_queue.listen_opt; + lopt = icsk->icsk_accept_queue.listen_opt; if (!lopt || !lopt->qlen) goto out; @@ -572,7 +573,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, } out: - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); return err; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ffa2402..8a8c5c2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, - struct sk_buff *skb) +static inline void tcp_measure_rcv_mss(struct sock *sk, + const struct sk_buff *skb) { - unsigned int len, lss; + struct inet_connection_sock *icsk = inet_csk(sk); + const unsigned int lss = icsk->icsk_ack.last_seg_size; + unsigned int len; - lss = tp->ack.last_seg_size; - tp->ack.last_seg_size = 0; + icsk->icsk_ack.last_seg_size = 0; /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ len = skb->len; - if (len >= tp->ack.rcv_mss) { - tp->ack.rcv_mss = len; + if (len >= icsk->icsk_ack.rcv_mss) { + icsk->icsk_ack.rcv_mss = len; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. @@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. */ - len -= tp->tcp_header_len; - tp->ack.last_seg_size = len; + len -= tcp_sk(sk)->tcp_header_len; + icsk->icsk_ack.last_seg_size = len; if (len == lss) { - tp->ack.rcv_mss = len; + icsk->icsk_ack.rcv_mss = len; return; } } - tp->ack.pending |= TCP_ACK_PUSHED; + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } } -static void tcp_incr_quickack(struct tcp_sock *tp) +static void tcp_incr_quickack(struct sock *sk) { - unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks==0) quickacks=2; - if (quickacks > tp->ack.quick) - tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); + if (quickacks > icsk->icsk_ack.quick) + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } -void tcp_enter_quickack_mode(struct tcp_sock *tp) +void tcp_enter_quickack_mode(struct sock *sk) { - tcp_incr_quickack(tp); - tp->ack.pingpong = 0; - tp->ack.ato = TCP_ATO_MIN; + struct inet_connection_sock *icsk = inet_csk(sk); + tcp_incr_quickack(sk); + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; } /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */ -static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp) +static inline int tcp_in_quickack_mode(const struct sock *sk) { - return (tp->ack.quick && !tp->ack.pingpong); + const struct inet_connection_sock *icsk = inet_csk(sk); + return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } /* Buffer size and advertised window tuning. @@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, + const struct sk_buff *skb) { /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; @@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) - return 2*tp->ack.rcv_mss; + return 2 * inet_csk(sk)->icsk_ack.rcv_mss; truesize >>= 1; window >>= 1; @@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, if (incr) { tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); - tp->ack.quick |= 1; + inet_csk(sk)->icsk_ack.quick |= 1; } } } @@ -325,7 +329,7 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) unsigned int app_win = tp->rcv_nxt - tp->copied_seq; int ofo_win = 0; - tp->ack.quick = 0; + inet_csk(sk)->icsk_ack.quick = 0; skb_queue_walk(&tp->out_of_order_queue, skb) { ofo_win += skb->len; @@ -346,8 +350,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) app_win += ofo_win; if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) app_win >>= 1; - if (app_win > tp->ack.rcv_mss) - app_win -= tp->ack.rcv_mss; + if (app_win > inet_csk(sk)->icsk_ack.rcv_mss) + app_win -= inet_csk(sk)->icsk_ack.rcv_mss; app_win = max(app_win, 2U*tp->advmss); if (!ofo_win) @@ -415,11 +419,12 @@ new_measure: tp->rcv_rtt_est.time = tcp_time_stamp; } -static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb) +static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss)) + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); } @@ -492,41 +497,42 @@ new_measure: */ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); u32 now; - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); - tcp_measure_rcv_mss(tp, skb); + tcp_measure_rcv_mss(sk, skb); tcp_rcv_rtt_measure(tp); now = tcp_time_stamp; - if (!tp->ack.ato) { + if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize * delayed ACK engine. */ - tcp_incr_quickack(tp); - tp->ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(sk); + icsk->icsk_ack.ato = TCP_ATO_MIN; } else { - int m = now - tp->ack.lrcvtime; + int m = now - icsk->icsk_ack.lrcvtime; if (m <= TCP_ATO_MIN/2) { /* The fastest case is the first. */ - tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; - } else if (m < tp->ack.ato) { - tp->ack.ato = (tp->ack.ato>>1) + m; - if (tp->ack.ato > tp->rto) - tp->ack.ato = tp->rto; - } else if (m > tp->rto) { + icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; + } else if (m < icsk->icsk_ack.ato) { + icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; + if (icsk->icsk_ack.ato > icsk->icsk_rto) + icsk->icsk_ack.ato = icsk->icsk_rto; + } else if (m > icsk->icsk_rto) { /* Too long gap. Apparently sender falled to * restart window, so that we send ACKs quickly. */ - tcp_incr_quickack(tp); + tcp_incr_quickack(sk); sk_stream_mem_reclaim(sk); } } - tp->ack.lrcvtime = now; + icsk->icsk_ack.lrcvtime = now; TCP_ECN_check_ce(tp, skb); @@ -611,8 +617,9 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static inline void tcp_set_rto(struct tcp_sock *tp) +static inline void tcp_set_rto(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) * * More seriously: @@ -623,7 +630,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some curcumstances. */ - tp->rto = (tp->srtt >> 3) + tp->rttvar; + inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -635,10 +642,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp) /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ -static inline void tcp_bound_rto(struct tcp_sock *tp) +static inline void tcp_bound_rto(struct sock *sk) { - if (tp->rto > TCP_RTO_MAX) - tp->rto = TCP_RTO_MAX; + if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) + inet_csk(sk)->icsk_rto = TCP_RTO_MAX; } /* Save metrics learned by this TCP session. @@ -658,7 +665,7 @@ void tcp_update_metrics(struct sock *sk) if (dst && (dst->flags&DST_HOST)) { int m; - if (tp->backoff || !tp->srtt) { + if (inet_csk(sk)->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. * Reset our results. @@ -801,9 +808,9 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev = dst_metric(dst, RTAX_RTTVAR); tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); } - tcp_set_rto(tp); - tcp_bound_rto(tp); - if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) + tcp_set_rto(sk); + tcp_bound_rto(sk); + if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) goto reset; tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; @@ -817,7 +824,7 @@ reset: if (!tp->rx_opt.saw_tstamp && tp->srtt) { tp->srtt = 0; tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; - tp->rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; } } @@ -1118,7 +1125,7 @@ void tcp_enter_frto(struct sock *sk) if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + (tp->ca_state == TCP_CA_Loss && !inet_csk(sk)->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tcp_ca_event(tp, CA_EVENT_FRTO); @@ -1214,7 +1221,7 @@ void tcp_enter_loss(struct sock *sk, int how) /* Reduce ssthresh if it has not yet been made inside this window. */ if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + (tp->ca_state == TCP_CA_Loss && !inet_csk(sk)->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tcp_ca_event(tp, CA_EVENT_LOSS); @@ -1253,7 +1260,7 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_ECN_queue_cwr(tp); } -static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) +static int tcp_check_sack_reneging(struct sock *sk) { struct sk_buff *skb; @@ -1268,9 +1275,10 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); - tp->retransmits++; + inet_csk(sk)->icsk_retransmits++; tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto); return 1; } return 0; @@ -1281,15 +1289,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; } -static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb) +static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) { - return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); + return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) { return tp->packets_out && - tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); + tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); } /* Linux NewReno/SACK/FACK/ECN state machine. @@ -1509,7 +1517,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) struct sk_buff *skb; sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(tp, skb) && + if (tcp_skb_timedout(sk, skb) && !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1676,7 +1684,7 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) tp->left_out = tp->sacked_out; tcp_undo_cwr(tp, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); - tp->retransmits = 0; + inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; if (!IsReno(tp)) tcp_set_ca_state(tp, TCP_CA_Open); @@ -1750,7 +1758,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) + if (tp->sacked_out && tcp_check_sack_reneging(sk)) return; /* C. Process data loss notification, provided it is valid. */ @@ -1774,7 +1782,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } else if (!before(tp->snd_una, tp->high_seq)) { switch (tp->ca_state) { case TCP_CA_Loss: - tp->retransmits = 0; + inet_csk(sk)->icsk_retransmits = 0; if (tcp_try_undo_recovery(sk, tp)) return; break; @@ -1824,7 +1832,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) - tp->retransmits = 0; + inet_csk(sk)->icsk_retransmits = 0; if (!tcp_try_undo_loss(sk, tp)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); @@ -1881,10 +1889,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) { - __u32 seq_rtt; - /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the @@ -1900,14 +1906,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) * answer arrives rto becomes 120 seconds! If at least one of segments * in window is lost... Voila. --ANK (010210) */ - seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + struct tcp_sock *tp = tcp_sk(sk); + const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; tcp_rtt_estimator(tp, seq_rtt, usrtt); - tcp_set_rto(tp); - tp->backoff = 0; - tcp_bound_rto(tp); + tcp_set_rto(sk); + inet_csk(sk)->icsk_backoff = 0; + tcp_bound_rto(sk); } -static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -1921,20 +1928,21 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tp, seq_rtt, usrtt); - tcp_set_rto(tp); - tp->backoff = 0; - tcp_bound_rto(tp); + tcp_rtt_estimator(tcp_sk(sk), seq_rtt, usrtt); + tcp_set_rto(sk); + inet_csk(sk)->icsk_backoff = 0; + tcp_bound_rto(sk); } -static inline void tcp_ack_update_rtt(struct tcp_sock *tp, - int flag, s32 seq_rtt, u32 *usrtt) +static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, + const s32 seq_rtt, u32 *usrtt) { + const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(tp, usrtt, flag); + tcp_ack_saw_tstamp(sk, usrtt, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); + tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); } static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, @@ -1951,9 +1959,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out) { - tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); } } @@ -2090,7 +2098,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt } if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); + tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); if (tp->ca_ops->pkts_acked) @@ -2125,20 +2133,21 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt static void tcp_ack_probe(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); /* Was it a usable window open? */ if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, tp->snd_una + tp->snd_wnd)) { - tp->backoff = 0; - tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + icsk->icsk_backoff = 0; + inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! */ } else { - tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RTO_MAX)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX)); } } @@ -2157,8 +2166,8 @@ static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack, - u32 ack_seq, u32 nwin) +static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, + const u32 ack_seq, const u32 nwin) { return (after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || @@ -2500,8 +2509,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * up to bandwidth of 18Gigabit/sec. 8) ] */ -static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) +static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th = skb->h.th; u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -2516,14 +2526,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && /* 4. ... and sits in replay window. */ - (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ); + (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } -static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb) +static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { + const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && - !tcp_disordered_ack(tp, skb)); + !tcp_disordered_ack(sk, skb)); } /* Check segment sequence number for validity. @@ -2586,7 +2597,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(sk, SOCK_DONE); @@ -2596,7 +2607,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - tp->ack.pingpong = 1; + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: @@ -2694,7 +2705,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode(sk); if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -2942,7 +2953,7 @@ queue_and_out: * gap in queue is filled. */ if (skb_queue_empty(&tp->out_of_order_queue)) - tp->ack.pingpong = 0; + inet_csk(sk)->icsk_ack.pingpong = 0; } if (tp->rx_opt.num_sacks) @@ -2963,8 +2974,8 @@ queue_and_out: tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: - tcp_enter_quickack_mode(tp); - tcp_schedule_ack(tp); + tcp_enter_quickack_mode(sk); + inet_csk_schedule_ack(sk); drop: __kfree_skb(skb); return; @@ -2974,7 +2985,7 @@ drop: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) goto out_of_window; - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode(sk); if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ @@ -3003,7 +3014,7 @@ drop: /* Disable header prediction. */ tp->pred_flags = 0; - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -3373,13 +3384,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) struct tcp_sock *tp = tcp_sk(sk); /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). Or... */ && __tcp_select_window(sk) >= tp->rcv_wnd) || /* We ACK each frame or... */ - tcp_in_quickack_mode(tp) || + tcp_in_quickack_mode(sk) || /* We have out of order data. */ (ofo_possible && skb_peek(&tp->out_of_order_queue))) { @@ -3393,8 +3404,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) static __inline__ void tcp_ack_snd_check(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_ack_scheduled(tp)) { + if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ return; } @@ -3648,7 +3658,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); /* We know that such packets are checksummed * on entry. @@ -3681,7 +3691,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; @@ -3702,7 +3712,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; @@ -3722,7 +3732,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); tcp_data_snd_check(sk, tp); - if (!tcp_ack_scheduled(tp)) + if (!inet_csk_ack_scheduled(sk)) goto no_ack; } @@ -3744,7 +3754,7 @@ slow_path: * RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(tp, skb)) { + tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); @@ -3791,7 +3801,7 @@ step5: if(th->ack) tcp_ack(sk, skb, FLAG_SLOWPATH); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); /* Process urgent data. */ tcp_urg(sk, skb, th); @@ -3933,7 +3943,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_buffer_space(sk); if (sock_flag(sk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); if (!tp->rx_opt.snd_wscale) __tcp_fast_path_on(tp, tp->snd_wnd); @@ -3945,7 +3955,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, sk_wake_async(sk, 0, POLL_OUT); } - if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) { + if (sk->sk_write_pending || tp->defer_accept || inet_csk(sk)->icsk_ack.pingpong) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * @@ -3953,12 +3963,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * look so _wonderfully_ clever, that I was not able * to stand against the temptation 8) --ANK */ - tcp_schedule_ack(tp); - tp->ack.lrcvtime = tcp_time_stamp; - tp->ack.ato = TCP_ATO_MIN; - tcp_incr_quickack(tp); - tcp_enter_quickack_mode(tp); - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(sk); + tcp_enter_quickack_mode(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX); discard: __kfree_skb(skb); @@ -4114,7 +4124,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(tp, skb)) { + tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); @@ -4183,7 +4193,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(tp, 0, 0); + tcp_ack_saw_tstamp(sk, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4230,9 +4240,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 1; } - tmo = tcp_fin_time(tp); + tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing @@ -4240,7 +4250,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * if it spins in bh_lock_sock(), but it is really * marginal case. */ - tcp_reset_keepalive_timer(sk, tmo); + inet_csk_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto discard; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e7e91e6..2cd4126 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -104,7 +104,7 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { */ int sysctl_local_port_range[2] = { 1024, 4999 }; -static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; @@ -113,7 +113,7 @@ static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb sk_for_each_bound(sk2, node, &tb->owners) { if (sk != sk2 && - !tcp_v6_ipv6only(sk2) && + !inet_v6_ipv6only(sk2) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { @@ -132,7 +132,8 @@ static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. */ -static int tcp_v4_get_port(struct sock *sk, unsigned short snum) +int inet_csk_get_port(struct inet_hashinfo *hashinfo, + struct sock *sk, unsigned short snum) { struct inet_bind_hashbucket *head; struct hlist_node *node; @@ -146,16 +147,16 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) int remaining = (high - low) + 1; int rover; - spin_lock(&tcp_hashinfo.portalloc_lock); - if (tcp_hashinfo.port_rover < low) + spin_lock(&hashinfo->portalloc_lock); + if (hashinfo->port_rover < low) rover = low; else - rover = tcp_hashinfo.port_rover; + rover = hashinfo->port_rover; do { rover++; if (rover > high) rover = low; - head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == rover) @@ -164,8 +165,8 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) next: spin_unlock(&head->lock); } while (--remaining > 0); - tcp_hashinfo.port_rover = rover; - spin_unlock(&tcp_hashinfo.portalloc_lock); + hashinfo->port_rover = rover; + spin_unlock(&hashinfo->portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash @@ -182,7 +183,7 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) */ snum = rover; } else { - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == snum) @@ -199,13 +200,13 @@ tb_found: goto success; } else { ret = 1; - if (tcp_bind_conflict(sk, tb)) + if (inet_csk_bind_conflict(sk, tb)) goto fail_unlock; } } tb_not_found: ret = 1; - if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL) + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -216,9 +217,9 @@ tb_not_found: (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) tb->fastreuse = 0; success: - if (!inet_sk(sk)->bind_hash) + if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_sk(sk)->bind_hash == tb); + BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); ret = 0; fail_unlock: @@ -228,6 +229,11 @@ fail: return ret; } +static int tcp_v4_get_port(struct sock *sk, unsigned short snum) +{ + return inet_csk_get_port(&tcp_hashinfo, sk, snum); +} + static void tcp_v4_hash(struct sock *sk) { inet_hash(&tcp_hashinfo, sk); @@ -426,7 +432,7 @@ ok: } head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_sk(sk)->bind_hash; + tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { __inet_hash(&tcp_hashinfo, sk, 0); @@ -557,25 +563,28 @@ failure: return err; } -static __inline__ int tcp_v4_iif(struct sk_buff *skb) +static inline int inet_iif(const struct sk_buff *skb) { return ((struct rtable *)skb->dst)->rt_iif; } -static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) +static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) { - return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); + return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); } -static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp, - struct request_sock ***prevp, - __u16 rport, - __u32 raddr, __u32 laddr) +struct request_sock *inet_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, const __u32 raddr, + const __u32 laddr) { - struct listen_sock *lopt = tp->accept_queue.listen_opt; + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; struct request_sock *req, **prev; - for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; + for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, + lopt->nr_table_entries)]; (req = *prev) != NULL; prev = &req->dl_next) { const struct inet_request_sock *ireq = inet_rsk(req); @@ -583,7 +592,7 @@ static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp, if (ireq->rmt_port == rport && ireq->rmt_addr == raddr && ireq->loc_addr == laddr && - TCP_INET_FAMILY(req->rsk_ops->family)) { + AF_INET_FAMILY(req->rsk_ops->family)) { BUG_TRAP(!req->sk); *prevp = prev; break; @@ -595,12 +604,13 @@ static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp, static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); - reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); - tcp_synq_added(sk); + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); } @@ -687,7 +697,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) } sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr, - th->source, tcp_v4_iif(skb)); + th->source, inet_iif(skb)); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; @@ -747,8 +757,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) if (sock_owned_by_user(sk)) goto out; - req = tcp_v4_search_req(tp, &prev, th->dest, - iph->daddr, iph->saddr); + req = inet_csk_search_req(sk, &prev, th->dest, + iph->daddr, iph->saddr); if (!req) goto out; @@ -768,7 +778,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) * created socket, and POSIX does not want network * errors returned from accept(). */ - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: @@ -953,8 +963,8 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) req->ts_recent); } -static struct dst_entry* tcp_v4_route_req(struct sock *sk, - struct request_sock *req) +struct dst_entry* inet_csk_route_req(struct sock *sk, + const struct request_sock *req) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); @@ -966,7 +976,7 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk, ireq->rmt_addr), .saddr = ireq->loc_addr, .tos = RT_CONN_FLAGS(sk) } }, - .proto = IPPROTO_TCP, + .proto = sk->sk_protocol, .uli_u = { .ports = { .sport = inet_sk(sk)->sport, .dport = ireq->rmt_port } } }; @@ -996,7 +1006,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, struct sk_buff * skb; /* First, grab a route. */ - if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) goto out; skb = tcp_make_synack(sk, dst, req); @@ -1098,7 +1108,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * limitations, they conserve resources and peer is * evidently real one. */ - if (tcp_synq_is_full(sk) && !isn) { + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { want_cookie = 1; @@ -1112,7 +1122,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * clogging syn queue with openreqs with exponentially increasing * timeout. */ - if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; req = reqsk_alloc(&tcp_request_sock_ops); @@ -1169,7 +1179,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && sysctl_tcp_tw_recycle && - (dst = tcp_v4_route_req(sk, req)) != NULL && + (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && @@ -1182,7 +1192,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) } /* Kill the following clause, if you dislike this way. */ else if (!sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - tcp_synq_len(sk) < + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && (!peer || !peer->tcp_ts_stamp) && (!dst || !dst_metric(dst, RTAX_RTT))) { @@ -1240,7 +1250,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (sk_acceptq_is_full(sk)) goto exit_overflow; - if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) goto exit; newsk = tcp_create_openreq_child(sk, req, skb); @@ -1257,7 +1267,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->saddr = ireq->loc_addr; newinet->opt = ireq->opt; ireq->opt = NULL; - newinet->mc_index = tcp_v4_iif(skb); + newinet->mc_index = inet_iif(skb); newinet->mc_ttl = skb->nh.iph->ttl; newtp->ext_header_len = 0; if (newinet->opt) @@ -1285,18 +1295,17 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { struct tcphdr *th = skb->h.th; struct iphdr *iph = skb->nh.iph; - struct tcp_sock *tp = tcp_sk(sk); struct sock *nsk; struct request_sock **prev; /* Find possible connection requests. */ - struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source, - iph->saddr, iph->daddr); + struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, + iph->saddr, iph->daddr); if (req) return tcp_check_req(sk, skb, req, prev); nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, - ntohs(th->dest), tcp_v4_iif(skb)); + ntohs(th->dest), inet_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { @@ -1440,7 +1449,7 @@ int tcp_v4_rcv(struct sk_buff *skb) sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, ntohs(th->dest), - tcp_v4_iif(skb)); + inet_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1507,7 +1516,7 @@ do_time_wait: struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, skb->nh.iph->daddr, ntohs(th->dest), - tcp_v4_iif(skb)); + inet_iif(skb)); if (sk2) { tcp_tw_deschedule((struct inet_timewait_sock *)sk); inet_twsk_put((struct inet_timewait_sock *)sk); @@ -1619,7 +1628,7 @@ static int tcp_v4_init_sock(struct sock *sk) tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - tp->rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the @@ -1672,7 +1681,7 @@ int tcp_v4_destroy_sock(struct sock *sk) __skb_queue_purge(&tp->ucopy.prequeue); /* Clean up a referenced TCP bind bucket. */ - if (inet_sk(sk)->bind_hash) + if (inet_csk(sk)->icsk_bind_hash) inet_put_port(&tcp_hashinfo, sk); /* @@ -1707,7 +1716,7 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) static void *listening_get_next(struct seq_file *seq, void *cur) { - struct tcp_sock *tp; + struct inet_connection_sock *icsk; struct hlist_node *node; struct sock *sk = cur; struct tcp_iter_state* st = seq->private; @@ -1723,7 +1732,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) if (st->state == TCP_SEQ_STATE_OPENREQ) { struct request_sock *req = cur; - tp = tcp_sk(st->syn_wait_sk); + icsk = inet_csk(st->syn_wait_sk); req = req->dl_next; while (1) { while (req) { @@ -1736,17 +1745,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur) if (++st->sbucket >= TCP_SYNQ_HSIZE) break; get_req: - req = tp->accept_queue.listen_opt->syn_table[st->sbucket]; + req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; } sk = sk_next(st->syn_wait_sk); st->state = TCP_SEQ_STATE_LISTENING; - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } else { - tp = tcp_sk(sk); - read_lock_bh(&tp->accept_queue.syn_wait_lock); - if (reqsk_queue_len(&tp->accept_queue)) + icsk = inet_csk(sk); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + if (reqsk_queue_len(&icsk->icsk_accept_queue)) goto start_req; - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); sk = sk_next(sk); } get_sk: @@ -1755,9 +1764,9 @@ get_sk: cur = sk; goto out; } - tp = tcp_sk(sk); - read_lock_bh(&tp->accept_queue.syn_wait_lock); - if (reqsk_queue_len(&tp->accept_queue)) { + icsk = inet_csk(sk); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: st->uid = sock_i_uid(sk); st->syn_wait_sk = sk; @@ -1765,7 +1774,7 @@ start_req: st->sbucket = 0; goto get_req; } - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } if (++st->bucket < INET_LHTABLE_SIZE) { sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); @@ -1951,8 +1960,8 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_OPENREQ: if (v) { - struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) @@ -2058,18 +2067,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) int timer_active; unsigned long timer_expires; struct tcp_sock *tp = tcp_sk(sp); + const struct inet_connection_sock *icsk = inet_csk(sp); struct inet_sock *inet = inet_sk(sp); unsigned int dest = inet->daddr; unsigned int src = inet->rcv_saddr; __u16 destp = ntohs(inet->dport); __u16 srcp = ntohs(inet->sport); - if (tp->pending == TCP_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; - timer_expires = tp->timeout; - } else if (tp->pending == TCP_TIME_PROBE0) { + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = tp->timeout; + timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; @@ -2084,12 +2094,14 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, timer_active, jiffies_to_clock_t(timer_expires - jiffies), - tp->retransmits, + icsk->icsk_retransmits, sock_i_uid(sp), tp->probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong, + icsk->icsk_rto, + icsk->icsk_ack.ato, + (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); } @@ -2174,7 +2186,7 @@ struct proto tcp_prot = { .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, - .accept = tcp_accept, + .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8b6cd8d..5682370 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -271,7 +271,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - const int rto = (tp->rto << 2) - (tp->rto >> 1); + const struct inet_connection_sock *icsk = inet_csk(sk); + const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; @@ -605,10 +606,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_sock *newinet = inet_sk(newsk); + struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp; newsk->sk_state = TCP_SYN_RECV; - newinet->bind_hash = NULL; + newicsk->icsk_bind_hash = NULL; /* Clone the TCP header template */ newinet->dport = ireq->rmt_port; @@ -624,11 +626,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); - newtp->retransmits = 0; - newtp->backoff = 0; + newicsk->icsk_retransmits = 0; + newicsk->icsk_backoff = 0; newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; - newtp->rto = TCP_TIMEOUT_INIT; + newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; newtp->left_out = 0; @@ -667,11 +669,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.num_sacks = 0; newtp->urg_data = 0; /* Deinitialize accept_queue to trap illegal accesses. */ - memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue)); + memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); if (sock_flag(newsk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); + inet_csk_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { @@ -701,7 +703,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->tcp_header_len = sizeof(struct tcphdr); } if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) - newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); if (newtp->ecn_flags&TCP_ECN_OK) @@ -881,10 +883,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (child == NULL) goto listen_overflow; - tcp_synq_unlink(tp, req, prev); - tcp_synq_removed(sk, req); + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); - tcp_acceptq_queue(sk, req, child); + inet_csk_reqsk_queue_add(sk, req, child); return child; listen_overflow: @@ -898,7 +900,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_reset(skb); - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); return NULL; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a4d1eb9..6f0a7e3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -105,8 +105,9 @@ static __u16 tcp_advertise_mss(struct sock *sk) /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) +static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) { + struct tcp_sock *tp = tcp_sk(sk); s32 delta = tcp_time_stamp - tp->lsndtime; u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; @@ -116,7 +117,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); - while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) + while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tp->snd_cwnd = max(cwnd, restart_cwnd); tp->snd_cwnd_stamp = tcp_time_stamp; @@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) static inline void tcp_event_data_sent(struct tcp_sock *tp, struct sk_buff *skb, struct sock *sk) { - u32 now = tcp_time_stamp; + struct inet_connection_sock *icsk = inet_csk(sk); + const u32 now = tcp_time_stamp; - if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) - tcp_cwnd_restart(tp, __sk_dst_get(sk)); + if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) + tcp_cwnd_restart(sk, __sk_dst_get(sk)); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) - tp->ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + icsk->icsk_ack.pingpong = 1; } static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { - struct tcp_sock *tp = tcp_sk(sk); - - tcp_dec_quickack_mode(tp, pkts); - tcp_clear_xmit_timer(sk, TCP_TIME_DACK); + tcp_dec_quickack_mode(sk, pkts); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } /* Determine a window scaling and initial window to offer. @@ -696,7 +696,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) tcp_cwnd_application_limited(sk); } } @@ -1147,6 +1147,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) */ u32 __tcp_select_window(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); /* MSS for the peer's data. Previous verions used mss_clamp * here. I don't know if the value based on our guesses @@ -1154,7 +1155,7 @@ u32 __tcp_select_window(struct sock *sk) * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ - int mss = tp->ack.rcv_mss; + int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); int window; @@ -1163,7 +1164,7 @@ u32 __tcp_select_window(struct sock *sk) mss = full_space; if (free_space < full_space/2) { - tp->ack.quick = 0; + icsk->icsk_ack.quick = 0; if (tcp_memory_pressure) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); @@ -1491,7 +1492,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == skb_peek(&sk->sk_write_queue)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto); } packet_cnt -= tcp_skb_pcount(skb); @@ -1544,7 +1546,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; if (skb == skb_peek(&sk->sk_write_queue)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } @@ -1780,8 +1782,8 @@ static inline void tcp_connect_init(struct sock *sk) tp->rcv_wup = 0; tp->copied_seq = 0; - tp->rto = TCP_TIMEOUT_INIT; - tp->retransmits = 0; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } @@ -1824,7 +1826,7 @@ int tcp_connect(struct sock *sk) TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); return 0; } @@ -1834,20 +1836,21 @@ int tcp_connect(struct sock *sk) */ void tcp_send_delayed_ack(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - int ato = tp->ack.ato; + struct inet_connection_sock *icsk = inet_csk(sk); + int ato = icsk->icsk_ack.ato; unsigned long timeout; if (ato > TCP_DELACK_MIN) { + const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ/2; - if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) + if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ /* If some rtt estimate is known, use it to bound delayed ack. - * Do not use tp->rto here, use results of rtt measurements + * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ if (tp->srtt) { @@ -1864,21 +1867,22 @@ void tcp_send_delayed_ack(struct sock *sk) timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ - if (tp->ack.pending&TCP_ACK_TIMER) { + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer was blocked or is about to expire, * send ACK now. */ - if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { + if (icsk->icsk_ack.blocked || + time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } - if (!time_before(timeout, tp->ack.timeout)) - timeout = tp->ack.timeout; + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; } - tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; - tp->ack.timeout = timeout; - sk_reset_timer(sk, &tp->delack_timer, timeout); + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. */ @@ -1895,9 +1899,9 @@ void tcp_send_ack(struct sock *sk) */ buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (buff == NULL) { - tcp_schedule_ack(tp); - tp->ack.ato = TCP_ATO_MIN; - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX); return; } @@ -2011,6 +2015,7 @@ int tcp_write_wakeup(struct sock *sk) */ void tcp_send_probe0(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int err; @@ -2019,16 +2024,16 @@ void tcp_send_probe0(struct sock *sk) if (tp->packets_out || !sk->sk_send_head) { /* Cancel probe timer, if it is not required. */ tp->probes_out = 0; - tp->backoff = 0; + icsk->icsk_backoff = 0; return; } if (err <= 0) { - if (tp->backoff < sysctl_tcp_retries2) - tp->backoff++; + if (icsk->icsk_backoff < sysctl_tcp_retries2) + icsk->icsk_backoff++; tp->probes_out++; - tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RTO_MAX)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX)); } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember probes_out. @@ -2038,8 +2043,9 @@ void tcp_send_probe0(struct sock *sk) */ if (!tp->probes_out) tp->probes_out=1; - tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, + TCP_RESOURCE_PROBE_INTERVAL)); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0084227..0b71380 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -36,9 +36,9 @@ static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); static void tcp_keepalive_timer (unsigned long data); -#ifdef TCP_DEBUG -const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; -EXPORT_SYMBOL(tcp_timer_bug_msg); +#ifdef INET_CSK_DEBUG +const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; +EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif /* @@ -46,40 +46,45 @@ EXPORT_SYMBOL(tcp_timer_bug_msg); * We may wish use just one timer maintaining a list of expire jiffies * to optimize. */ - -void tcp_init_xmit_timers(struct sock *sk) +void inet_csk_init_xmit_timers(struct sock *sk, + void (*retransmit_handler)(unsigned long), + void (*delack_handler)(unsigned long), + void (*keepalive_handler)(unsigned long)) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); - init_timer(&tp->retransmit_timer); - tp->retransmit_timer.function=&tcp_write_timer; - tp->retransmit_timer.data = (unsigned long) sk; - tp->pending = 0; + init_timer(&icsk->icsk_retransmit_timer); + init_timer(&icsk->icsk_delack_timer); + init_timer(&sk->sk_timer); - init_timer(&tp->delack_timer); - tp->delack_timer.function=&tcp_delack_timer; - tp->delack_timer.data = (unsigned long) sk; - tp->ack.pending = 0; + icsk->icsk_retransmit_timer.function = retransmit_handler; + icsk->icsk_delack_timer.function = delack_handler; + sk->sk_timer.function = keepalive_handler; - init_timer(&sk->sk_timer); - sk->sk_timer.function = &tcp_keepalive_timer; - sk->sk_timer.data = (unsigned long)sk; + icsk->icsk_retransmit_timer.data = + icsk->icsk_delack_timer.data = + sk->sk_timer.data = (unsigned long)sk; + + icsk->icsk_pending = icsk->icsk_ack.pending = 0; } -void tcp_clear_xmit_timers(struct sock *sk) +void inet_csk_clear_xmit_timers(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); - tp->pending = 0; - sk_stop_timer(sk, &tp->retransmit_timer); - - tp->ack.pending = 0; - tp->ack.blocked = 0; - sk_stop_timer(sk, &tp->delack_timer); + icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } +void tcp_init_xmit_timers(struct sock *sk) +{ + inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, + &tcp_keepalive_timer); +} + static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; @@ -155,15 +160,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive) /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (tp->retransmits) + if (icsk->icsk_retransmits) dst_negative_advice(&sk->sk_dst_cache); - retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; + retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { - if (tp->retransmits >= sysctl_tcp_retries1) { + if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black hole detection. :-( @@ -189,16 +194,16 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - int alive = (tp->rto < TCP_RTO_MAX); + const int alive = (icsk->icsk_rto < TCP_RTO_MAX); retry_until = tcp_orphan_retries(sk, alive); - if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) + if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until)) return 1; } } - if (tp->retransmits >= retry_until) { + if (icsk->icsk_retransmits >= retry_until) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -210,26 +215,27 @@ static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - tp->ack.blocked = 1; + icsk->icsk_ack.blocked = 1; NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN); + sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out_unlock; } sk_stream_mem_reclaim(sk); - if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER)) + if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; - if (time_after(tp->ack.timeout, jiffies)) { - sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); goto out; } - tp->ack.pending &= ~TCP_ACK_TIMER; + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; @@ -242,16 +248,16 @@ static void tcp_delack_timer(unsigned long data) tp->ucopy.memory = 0; } - if (tcp_ack_scheduled(tp)) { - if (!tp->ack.pingpong) { + if (inet_csk_ack_scheduled(sk)) { + if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ - tp->ack.ato = min(tp->ack.ato << 1, tp->rto); + icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ - tp->ack.pingpong = 0; - tp->ack.ato = TCP_ATO_MIN; + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_send_ack(sk); NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); @@ -294,7 +300,8 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - int alive = ((tp->rto<backoff) < TCP_RTO_MAX); + const struct inet_connection_sock *icsk = inet_csk(sk); + const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); max_probes = tcp_orphan_retries(sk, alive); @@ -317,6 +324,7 @@ static void tcp_probe_timer(struct sock *sk) static void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!tp->packets_out) goto out; @@ -351,7 +359,7 @@ static void tcp_retransmit_timer(struct sock *sk) if (tcp_write_timeout(sk)) goto out; - if (tp->retransmits == 0) { + if (icsk->icsk_retransmits == 0) { if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { if (tp->rx_opt.sack_ok) { if (tp->ca_state == TCP_CA_Recovery) @@ -381,10 +389,10 @@ static void tcp_retransmit_timer(struct sock *sk) /* Retransmission failed because of local congestion, * do not backoff. */ - if (!tp->retransmits) - tp->retransmits=1; - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, - min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); + if (!icsk->icsk_retransmits) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL)); goto out; } @@ -403,13 +411,13 @@ static void tcp_retransmit_timer(struct sock *sk) * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ - tp->backoff++; - tp->retransmits++; + icsk->icsk_backoff++; + icsk->icsk_retransmits++; out_reset_timer: - tp->rto = min(tp->rto << 1, TCP_RTO_MAX); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); - if (tp->retransmits > sysctl_tcp_retries1) + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto); + if (icsk->icsk_retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); out:; @@ -418,32 +426,32 @@ out:; static void tcp_write_timer(unsigned long data) { struct sock *sk = (struct sock*)data; - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int event; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ - sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20)); + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); goto out_unlock; } - if (sk->sk_state == TCP_CLOSE || !tp->pending) + if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) goto out; - if (time_after(tp->timeout, jiffies)) { - sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); + if (time_after(icsk->icsk_timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); goto out; } - event = tp->pending; - tp->pending = 0; + event = icsk->icsk_pending; + icsk->icsk_pending = 0; switch (event) { - case TCP_TIME_RETRANS: + case ICSK_TIME_RETRANS: tcp_retransmit_timer(sk); break; - case TCP_TIME_PROBE0: + case ICSK_TIME_PROBE0: tcp_probe_timer(sk); break; } @@ -463,8 +471,9 @@ out_unlock: static void tcp_synack_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; int thresh = max_retries; unsigned long now = jiffies; struct request_sock **reqp, *req; @@ -526,8 +535,8 @@ static void tcp_synack_timer(struct sock *sk) } /* Drop this request */ - tcp_synq_unlink(tp, req, reqp); - reqsk_queue_removed(&tp->accept_queue, req); + inet_csk_reqsk_queue_unlink(sk, req, reqp); + reqsk_queue_removed(&icsk->icsk_accept_queue, req); reqsk_free(req); continue; } @@ -541,15 +550,15 @@ static void tcp_synack_timer(struct sock *sk) lopt->clock_hand = i; if (lopt->qlen) - tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); + inet_csk_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); } -void tcp_delete_keepalive_timer (struct sock *sk) +void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); } -void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) +void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) { sk_reset_timer(sk, &sk->sk_timer, jiffies + len); } @@ -560,9 +569,9 @@ void tcp_set_keepalive(struct sock *sk, int val) return; if (val && !sock_flag(sk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) - tcp_delete_keepalive_timer(sk); + inet_csk_delete_keepalive_timer(sk); } @@ -576,7 +585,7 @@ static void tcp_keepalive_timer (unsigned long data) bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - tcp_reset_keepalive_timer (sk, HZ/20); + inet_csk_reset_keepalive_timer (sk, HZ/20); goto out; } @@ -587,7 +596,7 @@ static void tcp_keepalive_timer (unsigned long data) if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { - int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; if (tmo > 0) { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); @@ -634,7 +643,7 @@ static void tcp_keepalive_timer (unsigned long data) sk_stream_mem_reclaim(sk); resched: - tcp_reset_keepalive_timer (sk, elapsed); + inet_csk_reset_keepalive_timer (sk, elapsed); goto out; death: @@ -645,7 +654,7 @@ out: sock_put(sk); } -EXPORT_SYMBOL(tcp_clear_xmit_timers); -EXPORT_SYMBOL(tcp_delete_keepalive_timer); +EXPORT_SYMBOL(inet_csk_clear_xmit_timers); +EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); EXPORT_SYMBOL(tcp_init_xmit_timers); -EXPORT_SYMBOL(tcp_reset_keepalive_timer); +EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4582d9c..b9c3da3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1043,7 +1043,7 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); int sk_ipv6only = ipv6_only_sock(sk); - int sk2_ipv6only = tcp_v6_ipv6only(sk2); + int sk2_ipv6only = inet_v6_ipv6only(sk2); int addr_type = ipv6_addr_type(sk_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index af8ad5b..b9c7003 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -207,9 +207,9 @@ tb_not_found: tb->fastreuse = 0; success: - if (!inet_sk(sk)->bind_hash) + if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_sk(sk)->bind_hash == tb); + BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); ret = 0; fail_unlock: @@ -381,7 +381,7 @@ EXPORT_SYMBOL_GPL(tcp_v6_lookup); * Open request hash tables. */ -static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) +static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd) { u32 a, b, c; @@ -401,14 +401,15 @@ static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) return c & (TCP_SYNQ_HSIZE - 1); } -static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp, +static struct request_sock *tcp_v6_search_req(const struct sock *sk, struct request_sock ***prevp, __u16 rport, struct in6_addr *raddr, struct in6_addr *laddr, int iif) { - struct listen_sock *lopt = tp->accept_queue.listen_opt; + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; struct request_sock *req, **prev; for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; @@ -619,7 +620,7 @@ ok: } head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_sk(sk)->bind_hash; + tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { @@ -925,7 +926,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sock_owned_by_user(sk)) goto out; - req = tcp_v6_search_req(tp, &prev, th->dest, &hdr->daddr, + req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, &hdr->saddr, tcp_v6_iif(skb)); if (!req) goto out; @@ -940,7 +941,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: @@ -1245,11 +1246,10 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) { struct request_sock *req, **prev; struct tcphdr *th = skb->h.th; - struct tcp_sock *tp = tcp_sk(sk); struct sock *nsk; /* Find possible connection requests. */ - req = tcp_v6_search_req(tp, &prev, th->source, &skb->nh.ipv6h->saddr, + req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, tcp_v6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev); @@ -1278,12 +1278,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); - reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); - tcp_synq_added(sk); + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); } @@ -1308,13 +1308,13 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) /* * There are no SYN attacks on IPv6, yet... */ - if (tcp_synq_is_full(sk) && !isn) { + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { if (net_ratelimit()) printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n"); goto drop; } - if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; req = reqsk_alloc(&tcp6_request_sock_ops); @@ -2015,7 +2015,7 @@ static int tcp_v6_init_sock(struct sock *sk) tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - tp->rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the @@ -2098,18 +2098,20 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) unsigned long timer_expires; struct inet_sock *inet = inet_sk(sp); struct tcp_sock *tp = tcp_sk(sp); + const struct inet_connection_sock *icsk = inet_csk(sp); struct ipv6_pinfo *np = inet6_sk(sp); dest = &np->daddr; src = &np->rcv_saddr; destp = ntohs(inet->dport); srcp = ntohs(inet->sport); - if (tp->pending == TCP_TIME_RETRANS) { + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; - timer_expires = tp->timeout; - } else if (tp->pending == TCP_TIME_PROBE0) { + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = tp->timeout; + timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; @@ -2130,12 +2132,14 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, timer_active, jiffies_to_clock_t(timer_expires - jiffies), - tp->retransmits, + icsk->icsk_retransmits, sock_i_uid(sp), tp->probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong, + icsk->icsk_rto, + icsk->icsk_ack.ato, + (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong, tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh ); } @@ -2227,7 +2231,7 @@ struct proto tcpv6_prot = { .close = tcp_close, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, - .accept = tcp_accept, + .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, .destroy = tcp_v6_destroy_sock, -- cgit v0.10.2 From 3f421baa4720b708022f8bcc52a61e5cd6f10bf8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:11:08 -0700 Subject: [NET]: Just move the inet_connection_sock function from tcp sources Completing the previous changeset, this also generalises tcp_v4_synq_add, renaming it to inet_csk_reqsk_queue_hash_add, already geing used in the DCCP tree, which I plan to merge RSN. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 8859191..777339b 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -333,15 +333,10 @@ static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; } -static inline int inet_twsk_ipv6only(const struct sock *sk) -{ - return inet_twsk(sk)->tw_ipv6only; -} - static inline int inet_v6_ipv6only(const struct sock *sk) { return likely(sk->sk_state != TCP_TIME_WAIT) ? - ipv6_only_sock(sk) : inet_twsk_ipv6only(sk); + ipv6_only_sock(sk) : inet_twsk(sk)->tw_ipv6only; } #else #define __ipv6_only_sock(sk) 0 diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index ef60939..97e0020 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -16,9 +16,15 @@ #define _INET_CONNECTION_SOCK_H #include +#include #include #include +#define INET_CSK_DEBUG 1 + +/* Cancel timers, when they are not required. */ +#undef INET_CSK_CLEAR_TIMERS + struct inet_bind_bucket; struct inet_hashinfo; @@ -61,17 +67,107 @@ struct inet_connection_sock { } icsk_ack; }; +#define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +#define ICSK_TIME_DACK 2 /* Delayed ack timer */ +#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ +#define ICSK_TIME_KEEPOPEN 4 /* Keepalive timer */ + static inline struct inet_connection_sock *inet_csk(const struct sock *sk) { return (struct inet_connection_sock *)sk; } +enum inet_csk_ack_state_t { + ICSK_ACK_SCHED = 1, + ICSK_ACK_TIMER = 2, + ICSK_ACK_PUSHED = 4 +}; + extern void inet_csk_init_xmit_timers(struct sock *sk, void (*retransmit_handler)(unsigned long), void (*delack_handler)(unsigned long), void (*keepalive_handler)(unsigned long)); extern void inet_csk_clear_xmit_timers(struct sock *sk); +static inline void inet_csk_schedule_ack(struct sock *sk) +{ + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED; +} + +static inline int inet_csk_ack_scheduled(const struct sock *sk) +{ + return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED; +} + +static inline void inet_csk_delack_init(struct sock *sk) +{ + memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); +} + +extern void inet_csk_delete_keepalive_timer(struct sock *sk); +extern void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); + +#ifdef INET_CSK_DEBUG +extern const char inet_csk_timer_bug_msg[]; +#endif + +static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { + icsk->icsk_pending = 0; +#ifdef INET_CSK_CLEAR_TIMERS + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); +#endif + } else if (what == ICSK_TIME_DACK) { + icsk->icsk_ack.blocked = icsk->icsk_ack.pending = 0; +#ifdef INET_CSK_CLEAR_TIMERS + sk_stop_timer(sk, &icsk->icsk_delack_timer); +#endif + } +#ifdef INET_CSK_DEBUG + else { + pr_debug(inet_csk_timer_bug_msg); + } +#endif +} + +/* + * Reset the retransmission timer + */ +static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, + unsigned long when, + const unsigned long max_when) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (when > max_when) { +#ifdef INET_CSK_DEBUG + pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", + sk, what, when, current_text_addr()); +#endif + when = max_when; + } + + if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { + icsk->icsk_pending = what; + icsk->icsk_timeout = jiffies + when; + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + } else if (what == ICSK_TIME_DACK) { + icsk->icsk_ack.pending |= ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = jiffies + when; + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + } +#ifdef INET_CSK_DEBUG + else { + pr_debug(inet_csk_timer_bug_msg); + } +#endif +} + +extern struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); + extern struct request_sock *inet_csk_search_req(const struct sock *sk, struct request_sock ***prevp, const __u16 rport, @@ -83,4 +179,60 @@ extern int inet_csk_get_port(struct inet_hashinfo *hashinfo, extern struct dst_entry* inet_csk_route_req(struct sock *sk, const struct request_sock *req); +static inline void inet_csk_reqsk_queue_add(struct sock *sk, + struct request_sock *req, + struct sock *child) +{ + reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child); +} + +extern void inet_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned timeout); + +static inline void inet_csk_reqsk_queue_removed(struct sock *sk, + struct request_sock *req) +{ + if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0) + inet_csk_delete_keepalive_timer(sk); +} + +static inline void inet_csk_reqsk_queue_added(struct sock *sk, + const unsigned long timeout) +{ + if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0) + inet_csk_reset_keepalive_timer(sk, timeout); +} + +static inline int inet_csk_reqsk_queue_len(const struct sock *sk) +{ + return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue); +} + +static inline int inet_csk_reqsk_queue_young(const struct sock *sk) +{ + return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue); +} + +static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) +{ + return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue); +} + +static inline void inet_csk_reqsk_queue_unlink(struct sock *sk, + struct request_sock *req, + struct request_sock **prev) +{ + reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req, prev); +} + +static inline void inet_csk_reqsk_queue_drop(struct sock *sk, + struct request_sock *req, + struct request_sock **prev) +{ + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); + reqsk_free(req); +} + #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index a943c79..dd9a5a2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -19,18 +19,16 @@ #define _TCP_H #define TCP_DEBUG 1 -#define INET_CSK_DEBUG 1 #define FASTRETRANS_DEBUG 1 -/* Cancel timers, when they are not required. */ -#undef INET_CSK_CLEAR_TIMERS - #include #include #include #include #include #include + +#include #include #include #include @@ -206,11 +204,6 @@ extern void tcp_tw_deschedule(struct inet_timewait_sock *tw); #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 -#define ICSK_TIME_RETRANS 1 /* Retransmit timer */ -#define ICSK_TIME_DACK 2 /* Delayed ack timer */ -#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ -#define ICSK_TIME_KEEPOPEN 4 /* Keepalive timer */ - /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ @@ -257,12 +250,6 @@ extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; extern int tcp_memory_pressure; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -#define AF_INET_FAMILY(fam) ((fam) == AF_INET) -#else -#define AF_INET_FAMILY(fam) 1 -#endif - /* * Pointers to address related TCP functions * (i.e. things that depend on the address family) @@ -373,22 +360,6 @@ extern int tcp_rcv_established(struct sock *sk, extern void tcp_rcv_space_adjust(struct sock *sk); -enum inet_csk_ack_state_t { - ICSK_ACK_SCHED = 1, - ICSK_ACK_TIMER = 2, - ICSK_ACK_PUSHED = 4 -}; - -static inline void inet_csk_schedule_ack(struct sock *sk) -{ - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED; -} - -static inline int inet_csk_ack_scheduled(const struct sock *sk) -{ - return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED; -} - static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { @@ -406,11 +377,6 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, extern void tcp_enter_quickack_mode(struct sock *sk); -static inline void inet_csk_delack_init(struct sock *sk) -{ - memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); -} - static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = rx_opt->wscale_ok = rx_opt->snd_wscale = 0; @@ -442,7 +408,6 @@ extern void tcp_update_metrics(struct sock *sk); extern void tcp_close(struct sock *sk, long timeout); -extern struct sock * inet_csk_accept(struct sock *sk, int flags, int *err); extern unsigned int tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait); extern int tcp_getsockopt(struct sock *sk, int level, @@ -541,15 +506,9 @@ static inline void tcp_clear_xmit_timers(struct sock *sk) inet_csk_clear_xmit_timers(sk); } -extern void inet_csk_delete_keepalive_timer(struct sock *sk); -extern void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); extern unsigned int tcp_current_mss(struct sock *sk, int large); -#ifdef INET_CSK_DEBUG -extern const char inet_csk_timer_bug_msg[]; -#endif - /* tcp_diag.c */ extern void tcp_get_info(struct sock *, struct tcp_info *); @@ -559,60 +518,6 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { - icsk->icsk_pending = 0; -#ifdef INET_CSK_CLEAR_TIMERS - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); -#endif - } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.blocked = icsk->icsk_ack.pending = 0; -#ifdef INET_CSK_CLEAR_TIMERS - sk_stop_timer(sk, &icsk->icsk_delack_timer); -#endif - } -#ifdef INET_CSK_DEBUG - else { - pr_debug(inet_csk_timer_bug_msg); - } -#endif -} - -/* - * Reset the retransmission timer - */ -static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, - unsigned long when) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - if (when > TCP_RTO_MAX) { -#ifdef INET_CSK_DEBUG - pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", - sk, what, when, current_text_addr()); -#endif - when = TCP_RTO_MAX; - } - - if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { - icsk->icsk_pending = what; - icsk->icsk_timeout = jiffies + when; - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); - } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending |= ICSK_ACK_TIMER; - icsk->icsk_ack.timeout = jiffies + when; - sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); - } -#ifdef INET_CSK_DEBUG - else { - pr_debug(inet_csk_timer_bug_msg); - } -#endif -} - /* Initialize RCV_MSS value. * RCV_MSS is an our guess about MSS used by the peer. * We haven't any direct information about the MSS. @@ -765,7 +670,8 @@ static inline void tcp_packets_out_inc(struct sock *sk, tp->packets_out += tcp_skb_pcount(skb); if (!orig) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } static inline void tcp_packets_out_dec(struct tcp_sock *tp, @@ -934,7 +840,8 @@ static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *t { const struct inet_connection_sock *icsk = inet_csk(sk); if (!tp->packets_out && !icsk->icsk_pending) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, icsk->icsk_rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + icsk->icsk_rto, TCP_RTO_MAX); } static __inline__ void tcp_push_pending_frames(struct sock *sk, @@ -1017,7 +924,8 @@ static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb) wake_up_interruptible(sk->sk_sleep); if (!inet_csk_ack_scheduled(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - (3 * TCP_RTO_MIN) / 4); + (3 * TCP_RTO_MIN) / 4, + TCP_RTO_MAX); } return 1; } @@ -1181,58 +1089,6 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk->sk_rcvbuf); } -static inline void inet_csk_reqsk_queue_add(struct sock *sk, - struct request_sock *req, - struct sock *child) -{ - reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child); -} - -static inline void inet_csk_reqsk_queue_removed(struct sock *sk, - struct request_sock *req) -{ - if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0) - inet_csk_delete_keepalive_timer(sk); -} - -static inline void inet_csk_reqsk_queue_added(struct sock *sk, - const unsigned long timeout) -{ - if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0) - inet_csk_reset_keepalive_timer(sk, timeout); -} - -static inline int inet_csk_reqsk_queue_len(const struct sock *sk) -{ - return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue); -} - -static inline int inet_csk_reqsk_queue_young(const struct sock *sk) -{ - return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue); -} - -static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) -{ - return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue); -} - -static inline void inet_csk_reqsk_queue_unlink(struct sock *sk, - struct request_sock *req, - struct request_sock **prev) -{ - reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req, prev); -} - -static inline void inet_csk_reqsk_queue_drop(struct sock *sk, - struct request_sock *req, - struct request_sock **prev) -{ - inet_csk_reqsk_queue_unlink(sk, req, prev); - inet_csk_reqsk_queue_removed(sk, req); - reqsk_free(req); -} - static __inline__ void tcp_openreq_init(struct request_sock *req, struct tcp_options_received *rx_opt, struct sk_buff *skb) diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 6650d18..ea0e1d8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -5,7 +5,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ - inet_timewait_sock.o \ + inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c new file mode 100644 index 0000000..2712400 --- /dev/null +++ b/net/ipv4/inet_connection_sock.c @@ -0,0 +1,401 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET connection oriented protocols. + * + * Authors: See the TCP sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef INET_CSK_DEBUG +const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; +EXPORT_SYMBOL(inet_csk_timer_bug_msg); +#endif + +/* + * This array holds the first and last local port number. + * For high-usage systems, use sysctl to change this to + * 32768-61000 + */ +int sysctl_local_port_range[2] = { 1024, 4999 }; + +static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +{ + const u32 sk_rcv_saddr = inet_rcv_saddr(sk); + struct sock *sk2; + struct hlist_node *node; + int reuse = sk->sk_reuse; + + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + !inet_v6_ipv6only(sk2) && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if (!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) { + const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); + if (!sk2_rcv_saddr || !sk_rcv_saddr || + sk2_rcv_saddr == sk_rcv_saddr) + break; + } + } + } + return node != NULL; +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. + */ +int inet_csk_get_port(struct inet_hashinfo *hashinfo, + struct sock *sk, unsigned short snum) +{ + struct inet_bind_hashbucket *head; + struct hlist_node *node; + struct inet_bind_bucket *tb; + int ret; + + local_bh_disable(); + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int rover; + + spin_lock(&hashinfo->portalloc_lock); + if (hashinfo->port_rover < low) + rover = low; + else + rover = hashinfo->port_rover; + do { + rover++; + if (rover > high) + rover = low; + head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->port == rover) + goto next; + break; + next: + spin_unlock(&head->lock); + } while (--remaining > 0); + hashinfo->port_rover = rover; + spin_unlock(&hashinfo->portalloc_lock); + + /* Exhausted local port range during search? It is not + * possible for us to be holding one of the bind hash + * locks if this test triggers, because if 'remaining' + * drops to zero, we broke out of the do/while loop at + * the top level, not from the 'break;' statement. + */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. HEAD is + * non-NULL and we hold it's mutex. + */ + snum = rover; + } else { + head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->port == snum) + goto tb_found; + } + tb = NULL; + goto tb_not_found; +tb_found: + if (!hlist_empty(&tb->owners)) { + if (sk->sk_reuse > 1) + goto success; + if (tb->fastreuse > 0 && + sk->sk_reuse && sk->sk_state != TCP_LISTEN) { + goto success; + } else { + ret = 1; + if (inet_csk_bind_conflict(sk, tb)) + goto fail_unlock; + } + } +tb_not_found: + ret = 1; + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; +success: + if (!inet_csk(sk)->icsk_bind_hash) + inet_bind_hash(sk, tb, snum); + BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); + ret = 0; + +fail_unlock: + spin_unlock(&head->lock); +fail: + local_bh_enable(); + return ret; +} + +EXPORT_SYMBOL_GPL(inet_csk_get_port); + +/* + * Wait for an incoming connection, avoid race conditions. This must be called + * with the socket locked. + */ +static int inet_csk_wait_for_connect(struct sock *sk, long timeo) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + DEFINE_WAIT(wait); + int err; + + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. + * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. + */ + for (;;) { + prepare_to_wait_exclusive(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) + timeo = schedule_timeout(timeo); + lock_sock(sk); + err = 0; + if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) + break; + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + finish_wait(sk->sk_sleep, &wait); + return err; +} + +/* + * This will accept the next outstanding connection. + */ +struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct sock *newsk; + int error; + + lock_sock(sk); + + /* We need to make sure that this socket is listening, + * and that it has something pending. + */ + error = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out_err; + + /* Find already established connection */ + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + /* If this is a non blocking socket don't sleep */ + error = -EAGAIN; + if (!timeo) + goto out_err; + + error = inet_csk_wait_for_connect(sk, timeo); + if (error) + goto out_err; + } + + newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); + BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); +out: + release_sock(sk); + return newsk; +out_err: + newsk = NULL; + *err = error; + goto out; +} + +EXPORT_SYMBOL(inet_csk_accept); + +/* + * Using different timers for retransmit, delayed acks and probes + * We may wish use just one timer maintaining a list of expire jiffies + * to optimize. + */ +void inet_csk_init_xmit_timers(struct sock *sk, + void (*retransmit_handler)(unsigned long), + void (*delack_handler)(unsigned long), + void (*keepalive_handler)(unsigned long)) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + init_timer(&icsk->icsk_retransmit_timer); + init_timer(&icsk->icsk_delack_timer); + init_timer(&sk->sk_timer); + + icsk->icsk_retransmit_timer.function = retransmit_handler; + icsk->icsk_delack_timer.function = delack_handler; + sk->sk_timer.function = keepalive_handler; + + icsk->icsk_retransmit_timer.data = + icsk->icsk_delack_timer.data = + sk->sk_timer.data = (unsigned long)sk; + + icsk->icsk_pending = icsk->icsk_ack.pending = 0; +} + +EXPORT_SYMBOL(inet_csk_init_xmit_timers); + +void inet_csk_clear_xmit_timers(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &icsk->icsk_delack_timer); + sk_stop_timer(sk, &sk->sk_timer); +} + +EXPORT_SYMBOL(inet_csk_clear_xmit_timers); + +void inet_csk_delete_keepalive_timer(struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} + +EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); + +void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); + +struct dst_entry* inet_csk_route_req(struct sock *sk, + const struct request_sock *req) +{ + struct rtable *rt; + const struct inet_request_sock *ireq = inet_rsk(req); + struct ip_options *opt = inet_rsk(req)->opt; + struct flowi fl = { .oif = sk->sk_bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = ((opt && opt->srr) ? + opt->faddr : + ireq->rmt_addr), + .saddr = ireq->loc_addr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = sk->sk_protocol, + .uli_u = { .ports = + { .sport = inet_sk(sk)->sport, + .dport = ireq->rmt_port } } }; + + if (ip_route_output_flow(&rt, &fl, sk, 0)) { + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + ip_rt_put(rt); + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + return &rt->u.dst; +} + +EXPORT_SYMBOL_GPL(inet_csk_route_req); + +static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) +{ + return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#define AF_INET_FAMILY(fam) ((fam) == AF_INET) +#else +#define AF_INET_FAMILY(fam) 1 +#endif + +struct request_sock *inet_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, const __u32 raddr, + const __u32 laddr) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct inet_request_sock *ireq = inet_rsk(req); + + if (ireq->rmt_port == rport && + ireq->rmt_addr == raddr && + ireq->loc_addr == laddr && + AF_INET_FAMILY(req->rsk_ops->family)) { + BUG_TRAP(!req->sk); + *prevp = prev; + break; + } + } + + return req; +} + +EXPORT_SYMBOL_GPL(inet_csk_search_req); + +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + const unsigned timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8177b86..581016a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1805,98 +1805,6 @@ int tcp_disconnect(struct sock *sk, int flags) } /* - * Wait for an incoming connection, avoid race - * conditions. This must be called with the socket locked. - */ -static int wait_for_connect(struct sock *sk, long timeo) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - DEFINE_WAIT(wait); - int err; - - /* - * True wake-one mechanism for incoming connections: only - * one process gets woken up, not the 'whole herd'. - * Since we do not 'race & poll' for established sockets - * anymore, the common case will execute the loop only once. - * - * Subtle issue: "add_wait_queue_exclusive()" will be added - * after any current non-exclusive waiters, and we know that - * it will always _stay_ after any new non-exclusive waiters - * because all non-exclusive waiters are added at the - * beginning of the wait-queue. As such, it's ok to "drop" - * our exclusiveness temporarily when we get woken up without - * having to remove and re-insert us on the wait queue. - */ - for (;;) { - prepare_to_wait_exclusive(sk->sk_sleep, &wait, - TASK_INTERRUPTIBLE); - release_sock(sk); - if (reqsk_queue_empty(&icsk->icsk_accept_queue)) - timeo = schedule_timeout(timeo); - lock_sock(sk); - err = 0; - if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) - break; - err = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - break; - err = sock_intr_errno(timeo); - if (signal_pending(current)) - break; - err = -EAGAIN; - if (!timeo) - break; - } - finish_wait(sk->sk_sleep, &wait); - return err; -} - -/* - * This will accept the next outstanding connection. - */ - -struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct sock *newsk; - int error; - - lock_sock(sk); - - /* We need to make sure that this socket is listening, - * and that it has something pending. - */ - error = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - goto out_err; - - /* Find already established connection */ - if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { - long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); - - /* If this is a non blocking socket don't sleep */ - error = -EAGAIN; - if (!timeo) - goto out_err; - - error = wait_for_connect(sk, timeo); - if (error) - goto out_err; - } - - newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); - BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); -out: - release_sock(sk); - return newsk; -out_err: - newsk = NULL; - *err = error; - goto out; -} - -/* * Socket option code for TCP. */ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, @@ -2344,7 +2252,6 @@ void __init tcp_init(void) tcp_register_congestion_control(&tcp_reno); } -EXPORT_SYMBOL(inet_csk_accept); EXPORT_SYMBOL(tcp_close); EXPORT_SYMBOL(tcp_destroy_sock); EXPORT_SYMBOL(tcp_disconnect); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8a8c5c2..b35badf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1278,7 +1278,7 @@ static int tcp_check_sack_reneging(struct sock *sk) inet_csk(sk)->icsk_retransmits++; tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto); + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 1; } return 0; @@ -1961,7 +1961,7 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } } @@ -2147,7 +2147,8 @@ static void tcp_ack_probe(struct sock *sk) */ } else { inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX)); + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), + TCP_RTO_MAX); } } @@ -3968,7 +3969,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; tcp_incr_quickack(sk); tcp_enter_quickack_mode(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); discard: __kfree_skb(skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2cd4126..2f605b9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -97,138 +97,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .port_rover = 1024 - 1, }; -/* - * This array holds the first and last local port number. - * For high-usage systems, use sysctl to change this to - * 32768-61000 - */ -int sysctl_local_port_range[2] = { 1024, 4999 }; - -static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) -{ - const u32 sk_rcv_saddr = inet_rcv_saddr(sk); - struct sock *sk2; - struct hlist_node *node; - int reuse = sk->sk_reuse; - - sk_for_each_bound(sk2, node, &tb->owners) { - if (sk != sk2 && - !inet_v6_ipv6only(sk2) && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { - if (!reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) { - const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); - if (!sk2_rcv_saddr || !sk_rcv_saddr || - sk2_rcv_saddr == sk_rcv_saddr) - break; - } - } - } - return node != NULL; -} - -/* Obtain a reference to a local port for the given sock, - * if snum is zero it means select any available local port. - */ -int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum) -{ - struct inet_bind_hashbucket *head; - struct hlist_node *node; - struct inet_bind_bucket *tb; - int ret; - - local_bh_disable(); - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover; - - spin_lock(&hashinfo->portalloc_lock); - if (hashinfo->port_rover < low) - rover = low; - else - rover = hashinfo->port_rover; - do { - rover++; - if (rover > high) - rover = low; - head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == rover) - goto next; - break; - next: - spin_unlock(&head->lock); - } while (--remaining > 0); - hashinfo->port_rover = rover; - spin_unlock(&hashinfo->portalloc_lock); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (unlikely(remaining <= 0)) - goto fail; - - /* OK, here is the one we will use. HEAD is - * non-NULL and we hold it's mutex. - */ - snum = rover; - } else { - head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == snum) - goto tb_found; - } - tb = NULL; - goto tb_not_found; -tb_found: - if (!hlist_empty(&tb->owners)) { - if (sk->sk_reuse > 1) - goto success; - if (tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) { - goto success; - } else { - ret = 1; - if (inet_csk_bind_conflict(sk, tb)) - goto fail_unlock; - } - } -tb_not_found: - ret = 1; - if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) - goto fail_unlock; - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else - tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; -success: - if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); - ret = 0; - -fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); - return ret; -} - static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { return inet_csk_get_port(&tcp_hashinfo, sk, snum); @@ -568,52 +436,6 @@ static inline int inet_iif(const struct sk_buff *skb) return ((struct rtable *)skb->dst)->rt_iif; } -static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, - const u32 rnd, const u16 synq_hsize) -{ - return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); -} - -struct request_sock *inet_csk_search_req(const struct sock *sk, - struct request_sock ***prevp, - const __u16 rport, const __u32 raddr, - const __u32 laddr) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req, **prev; - - for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries)]; - (req = *prev) != NULL; - prev = &req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->rmt_port == rport && - ireq->rmt_addr == raddr && - ireq->loc_addr == laddr && - AF_INET_FAMILY(req->rsk_ops->family)) { - BUG_TRAP(!req->sk); - *prevp = prev; - break; - } - } - - return req; -} - -static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); - inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); -} - - /* * This routine does path mtu discovery as defined in RFC1191. */ @@ -963,36 +785,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) req->ts_recent); } -struct dst_entry* inet_csk_route_req(struct sock *sk, - const struct request_sock *req) -{ - struct rtable *rt; - const struct inet_request_sock *ireq = inet_rsk(req); - struct ip_options *opt = inet_rsk(req)->opt; - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .nl_u = { .ip4_u = - { .daddr = ((opt && opt->srr) ? - opt->faddr : - ireq->rmt_addr), - .saddr = ireq->loc_addr, - .tos = RT_CONN_FLAGS(sk) } }, - .proto = sk->sk_protocol, - .uli_u = { .ports = - { .sport = inet_sk(sk)->sport, - .dport = ireq->rmt_port } } }; - - if (ip_route_output_flow(&rt, &fl, sk, 0)) { - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { - ip_rt_put(rt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - return &rt->u.dst; -} - /* * Send a SYN-ACK after having received an ACK. * This still operates on a request_sock only, not on a big @@ -1222,7 +1014,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (want_cookie) { reqsk_free(req); } else { - tcp_v4_synq_add(sk, req); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } return 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6f0a7e3..f458eac 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1493,7 +1493,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto); + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } packet_cnt -= tcp_skb_pcount(skb); @@ -1546,7 +1547,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; if (skb == skb_peek(&sk->sk_write_queue)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } @@ -1826,7 +1829,8 @@ int tcp_connect(struct sock *sk) TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } @@ -1901,7 +1905,8 @@ void tcp_send_ack(struct sock *sk) if (buff == NULL) { inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); return; } @@ -2033,7 +2038,8 @@ void tcp_send_probe0(struct sock *sk) icsk->icsk_backoff++; tp->probes_out++; inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX)); + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), + TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember probes_out. @@ -2045,7 +2051,8 @@ void tcp_send_probe0(struct sock *sk) tp->probes_out=1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, min(icsk->icsk_rto << icsk->icsk_backoff, - TCP_RESOURCE_PROBE_INTERVAL)); + TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0b71380..c03930c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -36,55 +36,14 @@ static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); static void tcp_keepalive_timer (unsigned long data); -#ifdef INET_CSK_DEBUG -const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; -EXPORT_SYMBOL(inet_csk_timer_bug_msg); -#endif - -/* - * Using different timers for retransmit, delayed acks and probes - * We may wish use just one timer maintaining a list of expire jiffies - * to optimize. - */ -void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - init_timer(&icsk->icsk_retransmit_timer); - init_timer(&icsk->icsk_delack_timer); - init_timer(&sk->sk_timer); - - icsk->icsk_retransmit_timer.function = retransmit_handler; - icsk->icsk_delack_timer.function = delack_handler; - sk->sk_timer.function = keepalive_handler; - - icsk->icsk_retransmit_timer.data = - icsk->icsk_delack_timer.data = - sk->sk_timer.data = (unsigned long)sk; - - icsk->icsk_pending = icsk->icsk_ack.pending = 0; -} - -void inet_csk_clear_xmit_timers(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; - - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); - sk_stop_timer(sk, &icsk->icsk_delack_timer); - sk_stop_timer(sk, &sk->sk_timer); -} - void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); } +EXPORT_SYMBOL(tcp_init_xmit_timers); + static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; @@ -392,7 +351,8 @@ static void tcp_retransmit_timer(struct sock *sk) if (!icsk->icsk_retransmits) icsk->icsk_retransmits = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL)); + min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); goto out; } @@ -416,7 +376,7 @@ static void tcp_retransmit_timer(struct sock *sk) out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); if (icsk->icsk_retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); @@ -553,16 +513,6 @@ static void tcp_synack_timer(struct sock *sk) inet_csk_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); } -void inet_csk_delete_keepalive_timer(struct sock *sk) -{ - sk_stop_timer(sk, &sk->sk_timer); -} - -void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) -{ - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); -} - void tcp_set_keepalive(struct sock *sk, int val) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) @@ -653,8 +603,3 @@ out: bh_unlock_sock(sk); sock_put(sk); } - -EXPORT_SYMBOL(inet_csk_clear_xmit_timers); -EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); -EXPORT_SYMBOL(tcp_init_xmit_timers); -EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); -- cgit v0.10.2 From 9f1d2604c71498579609b1532fedc5a89276bb00 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:11:24 -0700 Subject: [ICSK]: Introduce inet_csk_clone Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 97e0020..a50f4a4b 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -77,6 +77,10 @@ static inline struct inet_connection_sock *inet_csk(const struct sock *sk) return (struct inet_connection_sock *)sk; } +extern struct sock *inet_csk_clone(struct sock *sk, + const struct request_sock *req, + const unsigned int __nocast priority); + enum inet_csk_ack_state_t { ICSK_ACK_SCHED = 1, ICSK_ACK_TIMER = 2, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 2712400..136ada0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -399,3 +399,28 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); + +struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, + const unsigned int __nocast priority) +{ + struct sock *newsk = sk_clone(sk, priority); + + if (newsk != NULL) { + struct inet_connection_sock *newicsk = inet_csk(newsk); + + newsk->sk_state = TCP_SYN_RECV; + newicsk->icsk_bind_hash = NULL; + + inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; + newsk->sk_write_space = sk_stream_write_space; + + newicsk->icsk_retransmits = 0; + newicsk->icsk_backoff = 0; + + /* Deinitialize accept_queue to trap illegal accesses. */ + memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + } + return newsk; +} + +EXPORT_SYMBOL_GPL(inet_csk_clone); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5682370..4cfbe1d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -600,22 +600,14 @@ out: */ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) { - struct sock *newsk = sk_clone(sk, GFP_ATOMIC); + struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); if (newsk != NULL) { - struct inet_request_sock *ireq = inet_rsk(req); + const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); - struct inet_sock *newinet = inet_sk(newsk); - struct inet_connection_sock *newicsk = inet_csk(newsk); + struct inet_connection_sock *newicsk = inet_csk(sk); struct tcp_sock *newtp; - newsk->sk_state = TCP_SYN_RECV; - newicsk->icsk_bind_hash = NULL; - - /* Clone the TCP header template */ - newinet->dport = ireq->rmt_port; - newsk->sk_write_space = sk_stream_write_space; - /* Now setup tcp_sock */ newtp = tcp_sk(newsk); newtp->pred_flags = 0; @@ -626,8 +618,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); - newicsk->icsk_retransmits = 0; - newicsk->icsk_backoff = 0; newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; newicsk->icsk_rto = TCP_TIMEOUT_INIT; @@ -668,8 +658,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->probes_out = 0; newtp->rx_opt.num_sacks = 0; newtp->urg_data = 0; - /* Deinitialize accept_queue to trap illegal accesses. */ - memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, -- cgit v0.10.2 From 0a5578cf8e5e045aaa68643c17ce885426697c6b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:11:41 -0700 Subject: [ICSK]: Generalise tcp_listen_{start,stop} This also moved inet_iif from tcp to inet_hashtables.h, as it is needed by the inet_lookup callers, perhaps this needs a bit of polishing, but for now seems fine. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index f0c21c0..646b6ea 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -280,6 +281,11 @@ out: wake_up(&hashinfo->lhash_wait); } +static inline int inet_iif(const struct sk_buff *skb) +{ + return ((struct rtable *)skb->dst)->rt_iif; +} + extern struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, const unsigned short hnum, diff --git a/include/net/sock.h b/include/net/sock.h index 48cc337..8678313 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -558,6 +558,7 @@ struct proto { kmem_cache_t *twsk_slab; unsigned int twsk_obj_size; + atomic_t *orphan_count; struct request_sock_ops *rsk_prot; diff --git a/include/net/tcp.h b/include/net/tcp.h index dd9a5a2..68f1ec1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -860,7 +860,7 @@ static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) tp->snd_wl1 = seq; } -extern void tcp_destroy_sock(struct sock *sk); +extern void inet_csk_destroy_sock(struct sock *sk); /* @@ -987,7 +987,7 @@ static __inline__ void tcp_done(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else - tcp_destroy_sock(sk); + inet_csk_destroy_sock(sk); } static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 7137e64..f691058 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -202,7 +202,7 @@ int inet_listen(struct socket *sock, int backlog) * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { - err = tcp_listen_start(sk); + err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); if (err) goto out; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 581016a..a1f8121 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -273,6 +273,8 @@ DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); atomic_t tcp_orphan_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(tcp_orphan_count); + int sysctl_tcp_mem[3]; int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; @@ -454,12 +456,11 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int __user *)arg); } - -int tcp_listen_start(struct sock *sk) +int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, TCP_SYNQ_HSIZE); + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); if (rc != 0) return rc; @@ -488,12 +489,13 @@ int tcp_listen_start(struct sock *sk) return -EADDRINUSE; } +EXPORT_SYMBOL_GPL(inet_csk_listen_start); + /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. */ - -static void tcp_listen_stop (struct sock *sk) +static void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *acc_req; @@ -524,13 +526,13 @@ static void tcp_listen_stop (struct sock *sk) BUG_TRAP(!sock_owned_by_user(child)); sock_hold(child); - tcp_disconnect(child, O_NONBLOCK); + sk->sk_prot->disconnect(child, O_NONBLOCK); sock_orphan(child); - atomic_inc(&tcp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); - tcp_destroy_sock(child); + inet_csk_destroy_sock(child); bh_unlock_sock(child); local_bh_enable(); @@ -542,6 +544,8 @@ static void tcp_listen_stop (struct sock *sk) BUG_TRAP(!sk->sk_ack_backlog); } +EXPORT_SYMBOL_GPL(inet_csk_listen_stop); + static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; @@ -1561,7 +1565,7 @@ void tcp_shutdown(struct sock *sk, int how) * can assume the socket waitqueue is inactive and nobody will * try to jump onto it. */ -void tcp_destroy_sock(struct sock *sk) +void inet_csk_destroy_sock(struct sock *sk) { BUG_TRAP(sk->sk_state == TCP_CLOSE); BUG_TRAP(sock_flag(sk, SOCK_DEAD)); @@ -1580,7 +1584,7 @@ void tcp_destroy_sock(struct sock *sk) sk_refcnt_debug_release(sk); - atomic_dec(&tcp_orphan_count); + atomic_dec(sk->sk_prot->orphan_count); sock_put(sk); } @@ -1596,7 +1600,7 @@ void tcp_close(struct sock *sk, long timeout) tcp_set_state(sk, TCP_CLOSE); /* Special case. */ - tcp_listen_stop(sk); + inet_csk_listen_stop(sk); goto adjudge_to_death; } @@ -1704,7 +1708,7 @@ adjudge_to_death: if (tmo > TCP_TIMEWAIT_LEN) { inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); } else { - atomic_inc(&tcp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } @@ -1712,7 +1716,7 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_stream_mem_reclaim(sk); - if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || + if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { if (net_ratelimit()) @@ -1723,10 +1727,10 @@ adjudge_to_death: NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); } } - atomic_inc(&tcp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); if (sk->sk_state == TCP_CLOSE) - tcp_destroy_sock(sk); + inet_csk_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ out: @@ -1757,7 +1761,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { - tcp_listen_stop(sk); + inet_csk_listen_stop(sk); } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { @@ -2253,7 +2257,7 @@ void __init tcp_init(void) } EXPORT_SYMBOL(tcp_close); -EXPORT_SYMBOL(tcp_destroy_sock); +EXPORT_SYMBOL(inet_csk_destroy_sock); EXPORT_SYMBOL(tcp_disconnect); EXPORT_SYMBOL(tcp_getsockopt); EXPORT_SYMBOL(tcp_ioctl); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2f605b9..b966102 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -431,11 +431,6 @@ failure: return err; } -static inline int inet_iif(const struct sk_buff *skb) -{ - return ((struct rtable *)skb->dst)->rt_iif; -} - /* * This routine does path mtu discovery as defined in RFC1191. */ @@ -1993,6 +1988,7 @@ struct proto tcp_prot = { .get_port = tcp_v4_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .sockets_allocated = &tcp_sockets_allocated, + .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b9c7003..0b51ec3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2248,6 +2248,7 @@ struct proto tcpv6_prot = { .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, + .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = sysctl_tcp_rmem, -- cgit v0.10.2 From 295f7324ff8d9ea58b4d3ec93b1aaa1d80e048a9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:11:56 -0700 Subject: [ICSK]: Introduce reqsk_queue_prune from code in tcp_synack_timer With this we're very close to getting all of the current TCP refactorings in my dccp-2.6 tree merged, next changeset will export some functions needed by the current DCCP code and then dccp-2.6.git will be born! Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 800930f..6200968 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -270,7 +270,7 @@ struct tcp_sock { __u8 frto_counter; /* Number of new acks after RTO */ __u8 nonagle; /* Disable Nagle algorithm? */ - __u8 defer_accept; /* User waits for some data after accept() */ + /* ONE BYTE HOLE, TRY TO PACK */ /* RTT measurement */ __u32 srtt; /* smoothed round trip time << 3 */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index a50f4a4b..692825f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -239,4 +239,6 @@ static inline void inet_csk_reqsk_queue_drop(struct sock *sk, reqsk_free(req); } +extern void inet_csk_listen_stop(struct sock *sk); + #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b7c7eec..447d287 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -97,6 +97,7 @@ struct listen_sock { * * @rskq_accept_head - FIFO head of established children * @rskq_accept_tail - FIFO tail of established children + * @rskq_defer_accept - User waits for some data after accept() * @syn_wait_lock - serializer * * %syn_wait_lock is necessary only to avoid proc interface having to grab the main @@ -112,6 +113,8 @@ struct request_sock_queue { struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; rwlock_t syn_wait_lock; + u8 rskq_defer_accept; + /* 3 bytes hole, try to pack */ struct listen_sock *listen_opt; }; @@ -255,4 +258,8 @@ static inline void reqsk_queue_hash_req(struct request_sock_queue *queue, write_unlock(&queue->syn_wait_lock); } +extern void reqsk_queue_prune(struct request_sock_queue *queue, struct sock *parent, + const unsigned long interval, const unsigned long timeout, + const unsigned long max_rto, int max_retries); + #endif /* _REQUEST_SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 68f1ec1..2423f05 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -423,7 +423,8 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, size_t len, int nonblock, int flags, int *addr_len); -extern int tcp_listen_start(struct sock *sk); +extern int inet_csk_listen_start(struct sock *sk, + const int nr_table_entries); extern void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 98f0fc9..b8203de 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -52,6 +52,7 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); rwlock_init(&queue->syn_wait_lock); queue->rskq_accept_head = queue->rskq_accept_head = NULL; + queue->rskq_defer_accept = 0; lopt->nr_table_entries = nr_table_entries; write_lock_bh(&queue->syn_wait_lock); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f691058..52f5ecc 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a1f8121..a4e9eec 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -495,7 +495,7 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start); * This routine closes sockets which have been at least partially * opened, but not yet accepted. */ -static void inet_csk_listen_stop(struct sock *sk) +void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *acc_req; @@ -1947,15 +1947,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, break; case TCP_DEFER_ACCEPT: - tp->defer_accept = 0; + icsk->icsk_accept_queue.rskq_defer_accept = 0; if (val > 0) { /* Translate value in seconds to number of * retransmits */ - while (tp->defer_accept < 32 && + while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && val > ((TCP_TIMEOUT_INIT / HZ) << - tp->defer_accept)) - tp->defer_accept++; - tp->defer_accept++; + icsk->icsk_accept_queue.rskq_defer_accept)) + icsk->icsk_accept_queue.rskq_defer_accept++; + icsk->icsk_accept_queue.rskq_defer_accept++; } break; @@ -2058,6 +2058,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int val, len; @@ -2095,7 +2096,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; break; case TCP_SYNCNT: - val = inet_csk(sk)->icsk_syn_retries ? : sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; break; case TCP_LINGER2: val = tp->linger2; @@ -2103,8 +2104,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) << - (tp->defer_accept - 1)); + val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : + ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; @@ -2125,7 +2126,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, return 0; } case TCP_QUICKACK: - val = !inet_csk(sk)->icsk_ack.pingpong; + val = !icsk->icsk_ack.pingpong; break; case TCP_CONGESTION: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b35badf..71d4561 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3831,6 +3831,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_parse_options(skb, &tp->rx_opt, 0); if (th->ack) { + struct inet_connection_sock *icsk; /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit @@ -3956,7 +3957,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, sk_wake_async(sk, 0, POLL_OUT); } - if (sk->sk_write_pending || tp->defer_accept || inet_csk(sk)->icsk_ack.pingpong) { + icsk = inet_csk(sk); + + if (sk->sk_write_pending || + icsk->icsk_accept_queue.rskq_defer_accept || + icsk->icsk_ack.pingpong) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * @@ -3965,8 +3970,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * to stand against the temptation 8) --ANK */ inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + icsk->icsk_ack.lrcvtime = tcp_time_stamp; + icsk->icsk_ack.ato = TCP_ATO_MIN; tcp_incr_quickack(sk); tcp_enter_quickack_mode(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4cfbe1d..2d95afe 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -787,9 +787,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, does sequence test, SYN is truncated, and thus we consider it a bare ACK. - If tp->defer_accept, we silently drop this bare ACK. Otherwise, - we create an established connection. Both ends (listening sockets) - accept the new incoming connection and try to talk to each other. 8-) + If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this + bare ACK. Otherwise, we create an established connection. Both + ends (listening sockets) accept the new incoming connection and try + to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. @@ -856,7 +857,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, return NULL; /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ - if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; return NULL; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c03930c..b614ad4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -424,16 +424,12 @@ out_unlock: sock_put(sk); } -/* - * Timer for listening sockets - */ - -static void tcp_synack_timer(struct sock *sk) +void reqsk_queue_prune(struct request_sock_queue *queue, struct sock *parent, + const unsigned long interval, const unsigned long timeout, + const unsigned long max_rto, int max_retries) { - struct tcp_sock *tp = tcp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + struct inet_connection_sock *icsk = inet_csk(parent); + struct listen_sock *lopt = queue->listen_opt; int thresh = max_retries; unsigned long now = jiffies; struct request_sock **reqp, *req; @@ -470,10 +466,10 @@ static void tcp_synack_timer(struct sock *sk) } } - if (tp->defer_accept) - max_retries = tp->defer_accept; + if (queue->rskq_defer_accept) + max_retries = queue->rskq_defer_accept; - budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); + budget = 2 * (lopt->nr_table_entries / (timeout / interval)); i = lopt->clock_hand; do { @@ -482,20 +478,19 @@ static void tcp_synack_timer(struct sock *sk) if (time_after_eq(now, req->expires)) { if ((req->retrans < thresh || (inet_rsk(req)->acked && req->retrans < max_retries)) - && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) { + && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { unsigned long timeo; if (req->retrans++ == 0) lopt->qlen_young--; - timeo = min((TCP_TIMEOUT_INIT << req->retrans), - TCP_RTO_MAX); + timeo = min((timeout << req->retrans), max_rto); req->expires = now + timeo; reqp = &req->dl_next; continue; } /* Drop this request */ - inet_csk_reqsk_queue_unlink(sk, req, reqp); + inet_csk_reqsk_queue_unlink(parent, req, reqp); reqsk_queue_removed(&icsk->icsk_accept_queue, req); reqsk_free(req); continue; @@ -503,14 +498,29 @@ static void tcp_synack_timer(struct sock *sk) reqp = &req->dl_next; } - i = (i+1)&(TCP_SYNQ_HSIZE-1); + i = (i + 1) & (lopt->nr_table_entries - 1); } while (--budget > 0); lopt->clock_hand = i; if (lopt->qlen) - inet_csk_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); + inet_csk_reset_keepalive_timer(parent, interval); +} + +EXPORT_SYMBOL_GPL(reqsk_queue_prune); + +/* + * Timer for listening sockets + */ + +static void tcp_synack_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + + reqsk_queue_prune(&icsk->icsk_accept_queue, sk, TCP_SYNQ_INTERVAL, + TCP_TIMEOUT_INIT, TCP_RTO_MAX, max_retries); } void tcp_set_keepalive(struct sock *sk, int val) -- cgit v0.10.2 From d8c97a9451068dd9f7b838a240bb6db894133a5e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:12:12 -0700 Subject: [NET]: Export symbols needed by the current DCCP code Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5cba59b..22882d9 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -47,6 +47,8 @@ void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashi inet_twsk_put(tw); } +EXPORT_SYMBOL_GPL(__inet_twsk_kill); + /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index dd568b0..633945d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -158,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, dst_output); } +EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); + static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ff4bd06..ddb1aed 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1090,7 +1090,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, EXPORT_SYMBOL(ip_cmsg_recv); -#ifdef CONFIG_IP_SCTP_MODULE EXPORT_SYMBOL(ip_getsockopt); EXPORT_SYMBOL(ip_setsockopt); -#endif diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3aef0e1..8c0b14e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2602,6 +2602,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) return ip_route_output_slow(rp, flp); } +EXPORT_SYMBOL_GPL(__ip_route_output_key); + int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) { int err; @@ -2620,6 +2622,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, return 0; } +EXPORT_SYMBOL_GPL(ip_route_output_flow); + int ip_route_output_key(struct rtable **rp, struct flowi *flp) { return ip_route_output_flow(rp, flp, NULL, 0); -- cgit v0.10.2 From c4365c9235f80128c3c3d5993074173941b1c1f0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:12:30 -0700 Subject: [RANDOM]: Introduce secure_dccp_sequence_number Code contributed by Stephen Hemminger. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/drivers/char/random.c b/drivers/char/random.c index 6b11d6b..7999da2 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1589,6 +1589,40 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp EXPORT_SYMBOL(secure_tcpv6_port_ephemeral); #endif +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +/* Similar to secure_tcp_sequence_number but generate a 48 bit value + * bit's 32-47 increase every key exchange + * 0-31 hash(source, dest) + */ +u64 secure_dccp_sequence_number(__u32 saddr, __u32 daddr, + __u16 sport, __u16 dport) +{ + struct timeval tv; + u64 seq; + __u32 hash[4]; + struct keydata *keyptr = get_keyptr(); + + hash[0] = saddr; + hash[1] = daddr; + hash[2] = (sport << 16) + dport; + hash[3] = keyptr->secret[11]; + + seq = half_md4_transform(hash, keyptr->secret); + seq |= ((u64)keyptr->count) << (32 - HASH_BITS); + + do_gettimeofday(&tv); + seq += tv.tv_usec + tv.tv_sec * 1000000; + seq &= (1ull << 48) - 1; +#if 0 + printk("dccp init_seq(%lx, %lx, %d, %d) = %d\n", + saddr, daddr, sport, dport, seq); +#endif + return seq; +} + +EXPORT_SYMBOL(secure_dccp_sequence_number); +#endif + #endif /* CONFIG_INET */ diff --git a/include/linux/random.h b/include/linux/random.h index cc67034..7b2adb3 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -59,6 +59,8 @@ extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, __u16 sport, __u16 dport); extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr, __u16 sport, __u16 dport); +extern u64 secure_dccp_sequence_number(__u32 saddr, __u32 daddr, + __u16 sport, __u16 dport); #ifndef MODULE extern struct file_operations random_fops, urandom_fops; -- cgit v0.10.2 From 7c657876b63cb1d8a2ec06f8fc6c37bb8412e66c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:14:34 -0700 Subject: [DCCP]: Initial implementation Development to this point was done on a subversion repository at: http://oops.ghostprotocols.net:81/cgi-bin/viewcvs.cgi/dccp-2.6/ This repository will be kept at this site for the foreseable future, so that interested parties can see the history of this code, attributions, etc. If I ever decide to take this offline I'll provide the full history at some other suitable place. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/dccp.h b/include/linux/dccp.h new file mode 100644 index 0000000..e3b4bf7 --- /dev/null +++ b/include/linux/dccp.h @@ -0,0 +1,432 @@ +#ifndef _LINUX_DCCP_H +#define _LINUX_DCCP_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* FIXME: this is utterly wrong */ +struct sockaddr_dccp { + struct sockaddr_in in; + unsigned int service; +}; + +enum dccp_state { + DCCP_OPEN = TCP_ESTABLISHED, + DCCP_REQUESTING = TCP_SYN_SENT, + DCCP_PARTOPEN = TCP_FIN_WAIT1, /* FIXME: + This mapping is horrible, but TCP has + no matching state for DCCP_PARTOPEN, + as TCP_SYN_RECV is already used by + DCCP_RESPOND, why don't stop using TCP + mapping of states? OK, now we don't use + sk_stream_sendmsg anymore, so doesn't + seem to exist any reason for us to + do the TCP mapping here */ + DCCP_LISTEN = TCP_LISTEN, + DCCP_RESPOND = TCP_SYN_RECV, + DCCP_CLOSING = TCP_CLOSING, + DCCP_TIME_WAIT = TCP_TIME_WAIT, + DCCP_CLOSED = TCP_CLOSE, + DCCP_MAX_STATES = TCP_MAX_STATES, +}; + +#define DCCP_STATE_MASK 0xf +#define DCCP_ACTION_FIN (1<<7) + +enum { + DCCPF_OPEN = TCPF_ESTABLISHED, + DCCPF_REQUESTING = TCPF_SYN_SENT, + DCCPF_PARTOPEN = TCPF_FIN_WAIT1, + DCCPF_LISTEN = TCPF_LISTEN, + DCCPF_RESPOND = TCPF_SYN_RECV, + DCCPF_CLOSING = TCPF_CLOSING, + DCCPF_TIME_WAIT = TCPF_TIME_WAIT, + DCCPF_CLOSED = TCPF_CLOSE, +}; + +/** + * struct dccp_hdr - generic part of DCCP packet header + * + * @dccph_sport - Relevant port on the endpoint that sent this packet + * @dccph_dport - Relevant port on the other endpoint + * @dccph_doff - Data Offset from the start of the DCCP header, in 32-bit words + * @dccph_ccval - Used by the HC-Sender CCID + * @dccph_cscov - Parts of the packet that are covered by the Checksum field + * @dccph_checksum - Internet checksum, depends on dccph_cscov + * @dccph_x - 0 = 24 bit sequence number, 1 = 48 + * @dccph_type - packet type, see DCCP_PKT_ prefixed macros + * @dccph_seq - sequence number high or low order 24 bits, depends on dccph_x + */ +struct dccp_hdr { + __u16 dccph_sport, + dccph_dport; + __u8 dccph_doff; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 dccph_cscov:4, + dccph_ccval:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 dccph_ccval:4, + dccph_cscov:4; +#else +#error "Adjust your defines" +#endif + __u16 dccph_checksum; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 dccph_x:1, + dccph_type:4, + dccph_reserved:3, + dccph_seq:24; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u32 dccph_reserved:3, + dccph_type:4, + dccph_x:1, + dccph_seq:24; +#else +#error "Adjust your defines" +#endif +}; + +static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) +{ + return (struct dccp_hdr *)skb->h.raw; +} + +/** + * struct dccp_hdr_ext - the low bits of a 48 bit seq packet + * + * @dccph_seq_low - low 24 bits of a 48 bit seq packet + */ +struct dccp_hdr_ext { + __u32 dccph_seq_low; +}; + +static inline struct dccp_hdr_ext *dccp_hdrx(const struct sk_buff *skb) +{ + return (struct dccp_hdr_ext *)(skb->h.raw + sizeof(struct dccp_hdr)); +} + +static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); +} + +static inline __u64 dccp_hdr_seq(const struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 seq_nr = ntohl(dh->dccph_seq << 8); +#elif defined(__BIG_ENDIAN_BITFIELD) + __u64 seq_nr = ntohl(dh->dccph_seq); +#else +#error "Adjust your defines" +#endif + + if (dh->dccph_x != 0) + seq_nr = (seq_nr << 32) + ntohl(dccp_hdrx(skb)->dccph_seq_low); + + return seq_nr; +} + +/** + * struct dccp_hdr_request - Conection initiation request header + * + * @dccph_req_service - Service to which the client app wants to connect + * @dccph_req_options - list of options (must be a multiple of 32 bits + */ +struct dccp_hdr_request { + __u32 dccph_req_service; +}; + +static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) +{ + return (struct dccp_hdr_request *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +/** + * struct dccp_hdr_ack_bits - acknowledgment bits common to most packets + * + * @dccph_resp_ack_nr_high - 48 bit ack number high order bits, contains GSR + * @dccph_resp_ack_nr_low - 48 bit ack number low order bits, contains GSR + */ +struct dccp_hdr_ack_bits { + __u32 dccph_reserved1:8, + dccph_ack_nr_high:24; + __u32 dccph_ack_nr_low; +}; + +static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) +{ + return (struct dccp_hdr_ack_bits *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) +{ + const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); +#if defined(__LITTLE_ENDIAN_BITFIELD) + return (((u64)ntohl(dhack->dccph_ack_nr_high << 8)) << 32) + ntohl(dhack->dccph_ack_nr_low); +#elif defined(__BIG_ENDIAN_BITFIELD) + return (((u64)ntohl(dhack->dccph_ack_nr_high)) << 32) + ntohl(dhack->dccph_ack_nr_low); +#else +#error "Adjust your defines" +#endif +} + +/** + * struct dccp_hdr_response - Conection initiation response header + * + * @dccph_resp_ack_nr_high - 48 bit ack number high order bits, contains GSR + * @dccph_resp_ack_nr_low - 48 bit ack number low order bits, contains GSR + * @dccph_resp_service - Echoes the Service Code on a received DCCP-Request + * @dccph_resp_options - list of options (must be a multiple of 32 bits + */ +struct dccp_hdr_response { + struct dccp_hdr_ack_bits dccph_resp_ack; + __u32 dccph_resp_service; +}; + +static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) +{ + return (struct dccp_hdr_response *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +/** + * struct dccp_hdr_reset - Unconditionally shut down a connection + * + * @dccph_reset_service - Echoes the Service Code on a received DCCP-Request + * @dccph_reset_options - list of options (must be a multiple of 32 bits + */ +struct dccp_hdr_reset { + struct dccp_hdr_ack_bits dccph_reset_ack; + __u8 dccph_reset_code, + dccph_reset_data[3]; +}; + +static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) +{ + return (struct dccp_hdr_reset *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +enum dccp_pkt_type { + DCCP_PKT_REQUEST = 0, + DCCP_PKT_RESPONSE, + DCCP_PKT_DATA, + DCCP_PKT_ACK, + DCCP_PKT_DATAACK, + DCCP_PKT_CLOSEREQ, + DCCP_PKT_CLOSE, + DCCP_PKT_RESET, + DCCP_PKT_SYNC, + DCCP_PKT_SYNCACK, + DCCP_PKT_INVALID, +}; + +#define DCCP_NR_PKT_TYPES DCCP_PKT_INVALID + +static inline unsigned int dccp_packet_hdr_len(const __u8 type) +{ + if (type == DCCP_PKT_DATA) + return 0; + if (type == DCCP_PKT_DATAACK || + type == DCCP_PKT_ACK || + type == DCCP_PKT_SYNC || + type == DCCP_PKT_SYNCACK || + type == DCCP_PKT_CLOSE || + type == DCCP_PKT_CLOSEREQ) + return sizeof(struct dccp_hdr_ack_bits); + if (type == DCCP_PKT_REQUEST) + return sizeof(struct dccp_hdr_request); + if (type == DCCP_PKT_RESPONSE) + return sizeof(struct dccp_hdr_response); + return sizeof(struct dccp_hdr_reset); +} + +static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) +{ + return dccp_basic_hdr_len(skb) + + dccp_packet_hdr_len(dccp_hdr(skb)->dccph_type); +} + +enum dccp_reset_codes { + DCCP_RESET_CODE_UNSPECIFIED = 0, + DCCP_RESET_CODE_CLOSED, + DCCP_RESET_CODE_ABORTED, + DCCP_RESET_CODE_NO_CONNECTION, + DCCP_RESET_CODE_PACKET_ERROR, + DCCP_RESET_CODE_OPTION_ERROR, + DCCP_RESET_CODE_MANDATORY_ERROR, + DCCP_RESET_CODE_CONNECTION_REFUSED, + DCCP_RESET_CODE_BAD_SERVICE_CODE, + DCCP_RESET_CODE_TOO_BUSY, + DCCP_RESET_CODE_BAD_INIT_COOKIE, + DCCP_RESET_CODE_AGGRESSION_PENALTY, +}; + +/* DCCP options */ +enum { + DCCPO_PADDING = 0, + DCCPO_MANDATORY = 1, + DCCPO_MIN_RESERVED = 3, + DCCPO_MAX_RESERVED = 31, + DCCPO_NDP_COUNT = 37, + DCCPO_ACK_VECTOR_0 = 38, + DCCPO_ACK_VECTOR_1 = 39, + DCCPO_TIMESTAMP = 41, + DCCPO_TIMESTAMP_ECHO = 42, + DCCPO_ELAPSED_TIME = 43, + DCCPO_MAX = 45, + DCCPO_MIN_CCID_SPECIFIC = 128, + DCCPO_MAX_CCID_SPECIFIC = 255, +}; + +/* DCCP features */ +enum { + DCCPF_RESERVED = 0, + DCCPF_SEQUENCE_WINDOW = 3, + DCCPF_SEND_ACK_VECTOR = 6, + DCCPF_SEND_NDP_COUNT = 7, + /* 10-127 reserved */ + DCCPF_MIN_CCID_SPECIFIC = 128, + DCCPF_MAX_CCID_SPECIFIC = 255, +}; + +/* initial values for each feature */ +#define DCCPF_INITIAL_SEQUENCE_WINDOW 100 +/* FIXME: for now we're using CCID 3 (TFRC) */ +#define DCCPF_INITIAL_CCID 3 +#define DCCPF_INITIAL_SEND_ACK_VECTOR 0 +/* FIXME: for now we're default to 1 but it should really be 0 */ +#define DCCPF_INITIAL_SEND_NDP_COUNT 1 + +#define DCCP_NDP_LIMIT 0xFFFFFF + +/** + * struct dccp_options - option values for a DCCP connection + * @dccpo_sequence_window - Sequence Window Feature (section 7.5.2) + * @dccpo_ccid - Congestion Control Id (CCID) (section 10) + * @dccpo_send_ack_vector - Send Ack Vector Feature (section 11.5) + * @dccpo_send_ndp_count - Send NDP Count Feature (7.7.2) + */ +struct dccp_options { + __u64 dccpo_sequence_window; + __u8 dccpo_ccid; + __u8 dccpo_send_ack_vector; + __u8 dccpo_send_ndp_count; +}; + +extern void __dccp_options_init(struct dccp_options *dccpo); +extern void dccp_options_init(struct dccp_options *dccpo); +extern int dccp_parse_options(struct sock *sk, struct sk_buff *skb); + +struct dccp_request_sock { + struct inet_request_sock dreq_inet_rsk; + __u64 dreq_iss; + __u64 dreq_isr; + __u32 dreq_service; +}; + +static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) +{ + return (struct dccp_request_sock *)req; +} + +/* Read about the ECN nonce to see why it is 253 */ +#define DCCP_MAX_ACK_VECTOR_LEN 253 + +struct dccp_options_received { + u32 dccpor_ndp:24, + dccpor_ack_vector_len:8; + u32 dccpor_ack_vector_idx:10; + /* 22 bits hole, try to pack */ + u32 dccpor_timestamp; + u32 dccpor_timestamp_echo; + u32 dccpor_elapsed_time; +}; + +struct ccid; + +enum dccp_role { + DCCP_ROLE_UNDEFINED, + DCCP_ROLE_LISTEN, + DCCP_ROLE_CLIENT, + DCCP_ROLE_SERVER, +}; + +/** + * struct dccp_sock - DCCP socket state + * + * @dccps_swl - sequence number window low + * @dccps_swh - sequence number window high + * @dccps_awl - acknowledgement number window low + * @dccps_awh - acknowledgement number window high + * @dccps_iss - initial sequence number sent + * @dccps_isr - initial sequence number received + * @dccps_osr - first OPEN sequence number received + * @dccps_gss - greatest sequence number sent + * @dccps_gsr - greatest valid sequence number received + * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss + * @dccps_timestamp_time - time of latest TIMESTAMP option + * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option + * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options) + * @dccps_pmtu_cookie - Last pmtu seen by socket + * @dccps_avg_packet_size - FIXME: has to be set by the app thru some setsockopt or ioctl, CCID3 uses it + * @dccps_role - Role of this sock, one of %dccp_role + * @dccps_ndp_count - number of Non Data Packets since last data packet + * @dccps_hc_rx_ackpkts - receiver half connection acked packets + */ +struct dccp_sock { + /* inet_connection_sock has to be the first member of dccp_sock */ + struct inet_connection_sock dccps_inet_connection; + __u64 dccps_swl; + __u64 dccps_swh; + __u64 dccps_awl; + __u64 dccps_awh; + __u64 dccps_iss; + __u64 dccps_isr; + __u64 dccps_osr; + __u64 dccps_gss; + __u64 dccps_gsr; + __u64 dccps_gar; + unsigned long dccps_service; + unsigned long dccps_timestamp_time; + __u32 dccps_timestamp_echo; + __u32 dccps_avg_packet_size; + unsigned long dccps_ndp_count; + __u16 dccps_ext_header_len; + __u32 dccps_pmtu_cookie; + __u32 dccps_mss_cache; + struct dccp_options dccps_options; + struct dccp_ackpkts *dccps_hc_rx_ackpkts; + void *dccps_hc_rx_ccid_private; + void *dccps_hc_tx_ccid_private; + struct ccid *dccps_hc_rx_ccid; + struct ccid *dccps_hc_tx_ccid; + struct dccp_options_received dccps_options_received; + enum dccp_role dccps_role:2; +}; + +static inline struct dccp_sock *dccp_sk(const struct sock *sk) +{ + return (struct dccp_sock *)sk; +} + +static inline const char *dccp_role(const struct sock *sk) +{ + switch (dccp_sk(sk)->dccps_role) { + case DCCP_ROLE_UNDEFINED: return "undefined"; + case DCCP_ROLE_LISTEN: return "listen"; + case DCCP_ROLE_SERVER: return "server"; + case DCCP_ROLE_CLIENT: return "client"; + } + return NULL; +} + +#endif /* _LINUX_DCCP_H */ diff --git a/include/linux/in.h b/include/linux/in.h index fb88c66..ba35538 100644 --- a/include/linux/in.h +++ b/include/linux/in.h @@ -32,6 +32,7 @@ enum { IPPROTO_PUP = 12, /* PUP protocol */ IPPROTO_UDP = 17, /* User Datagram Protocol */ IPPROTO_IDP = 22, /* XNS IDP protocol */ + IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */ IPPROTO_RSVP = 46, /* RSVP protocol */ IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */ diff --git a/include/linux/net.h b/include/linux/net.h index 3990661..5f8b632 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -84,6 +84,7 @@ enum sock_type { SOCK_RAW = 3, SOCK_RDM = 4, SOCK_SEQPACKET = 5, + SOCK_DCCP = 6, SOCK_PACKET = 10, }; diff --git a/include/linux/socket.h b/include/linux/socket.h index a5c7d96..ddf2255 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -271,6 +271,7 @@ struct ucred { #define SOL_IRDA 266 #define SOL_NETBEUI 267 #define SOL_LLC 268 +#define SOL_DCCP 269 /* IPX options */ #define IPX_TYPE 1 diff --git a/net/Kconfig b/net/Kconfig index 02877ac..c07aafb 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -147,6 +147,7 @@ source "net/bridge/netfilter/Kconfig" endif +source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/atm/Kconfig" source "net/bridge/Kconfig" diff --git a/net/Makefile b/net/Makefile index 4a01be8..7e6eff2 100644 --- a/net/Makefile +++ b/net/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_DECNET) += decnet/ obj-$(CONFIG_ECONET) += econet/ obj-$(CONFIG_VLAN_8021Q) += 8021q/ +obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ ifeq ($(CONFIG_NET),y) diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig new file mode 100644 index 0000000..90460bc --- /dev/null +++ b/net/dccp/Kconfig @@ -0,0 +1,24 @@ +menu "DCCP Configuration (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + +config IP_DCCP + tristate "The DCCP Protocol (EXPERIMENTAL)" + ---help--- + Datagram Congestion Control Protocol + + From draft-ietf-dccp-spec-11 . + + The Datagram Congestion Control Protocol (DCCP) is a transport + protocol that implements bidirectional, unicast connections of + congestion-controlled, unreliable datagrams. It should be suitable + for use by applications such as streaming media, Internet telephony, + and on-line games + + To compile this protocol support as a module, choose M here: the + module will be called dccp. + + If in doubt, say N. + +source "net/dccp/ccids/Kconfig" + +endmenu diff --git a/net/dccp/Makefile b/net/dccp/Makefile new file mode 100644 index 0000000..c6e6ba5 --- /dev/null +++ b/net/dccp/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_IP_DCCP) += dccp.o + +dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o timer.o + +obj-y += ccids/ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c new file mode 100644 index 0000000..9d8fc0e --- /dev/null +++ b/net/dccp/ccid.c @@ -0,0 +1,139 @@ +/* + * net/dccp/ccid.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * CCID infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "ccid.h" + +static struct ccid *ccids[CCID_MAX]; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) +static atomic_t ccids_lockct = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(ccids_lock); + +/* + * The strategy is: modifications ccids vector are short, do not sleep and + * veeery rare, but read access should be free of any exclusive locks. + */ +static void ccids_write_lock(void) +{ + spin_lock(&ccids_lock); + while (atomic_read(&ccids_lockct) != 0) { + spin_unlock(&ccids_lock); + yield(); + spin_lock(&ccids_lock); + } +} + +static inline void ccids_write_unlock(void) +{ + spin_unlock(&ccids_lock); +} + +static inline void ccids_read_lock(void) +{ + atomic_inc(&ccids_lockct); + spin_unlock_wait(&ccids_lock); +} + +static inline void ccids_read_unlock(void) +{ + atomic_dec(&ccids_lockct); +} + +#else +#define ccids_write_lock() do { } while(0) +#define ccids_write_unlock() do { } while(0) +#define ccids_read_lock() do { } while(0) +#define ccids_read_unlock() do { } while(0) +#endif + +int ccid_register(struct ccid *ccid) +{ + int err; + + if (ccid->ccid_init == NULL) + return -1; + + ccids_write_lock(); + err = -EEXIST; + if (ccids[ccid->ccid_id] == NULL) { + ccids[ccid->ccid_id] = ccid; + err = 0; + } + ccids_write_unlock(); + if (err == 0) + pr_info("CCID: Registered CCID %d (%s)\n", + ccid->ccid_id, ccid->ccid_name); + return err; +} + +EXPORT_SYMBOL_GPL(ccid_register); + +int ccid_unregister(struct ccid *ccid) +{ + ccids_write_lock(); + ccids[ccid->ccid_id] = NULL; + ccids_write_unlock(); + pr_info("CCID: Unregistered CCID %d (%s)\n", + ccid->ccid_id, ccid->ccid_name); + return 0; +} + +EXPORT_SYMBOL_GPL(ccid_unregister); + +struct ccid *ccid_init(unsigned char id, struct sock *sk) +{ + struct ccid *ccid; + +#ifdef CONFIG_KMOD + if (ccids[id] == NULL) + request_module("net-dccp-ccid-%d", id); +#endif + ccids_read_lock(); + + ccid = ccids[id]; + if (ccid == NULL) + goto out; + + if (!try_module_get(ccid->ccid_owner)) + goto out_err; + + if (ccid->ccid_init(sk) != 0) + goto out_module_put; +out: + ccids_read_unlock(); + return ccid; +out_module_put: + module_put(ccid->ccid_owner); +out_err: + ccid = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(ccid_init); + +void ccid_exit(struct ccid *ccid, struct sock *sk) +{ + if (ccid == NULL) + return; + + ccids_read_lock(); + + if (ccids[ccid->ccid_id] != NULL) { + if (ccid->ccid_exit != NULL) + ccid->ccid_exit(sk); + module_put(ccid->ccid_owner); + } + + ccids_read_unlock(); +} + +EXPORT_SYMBOL_GPL(ccid_exit); diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h new file mode 100644 index 0000000..06105b2 --- /dev/null +++ b/net/dccp/ccid.h @@ -0,0 +1,156 @@ +#ifndef _CCID_H +#define _CCID_H +/* + * net/dccp/ccid.h + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * CCID infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#define CCID_MAX 255 + +struct ccid { + unsigned char ccid_id; + const char *ccid_name; + struct module *ccid_owner; + int (*ccid_init)(struct sock *sk); + void (*ccid_exit)(struct sock *sk); + int (*ccid_hc_rx_init)(struct sock *sk); + int (*ccid_hc_tx_init)(struct sock *sk); + void (*ccid_hc_rx_exit)(struct sock *sk); + void (*ccid_hc_tx_exit)(struct sock *sk); + void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); + int (*ccid_hc_rx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); + void (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); + void (*ccid_hc_tx_insert_options)(struct sock *sk, struct sk_buff *skb); + void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); + int (*ccid_hc_tx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); + int (*ccid_hc_tx_send_packet)(struct sock *sk, + struct sk_buff *skb, int len, + long *delay); + void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more, int len); +}; + +extern int ccid_register(struct ccid *ccid); +extern int ccid_unregister(struct ccid *ccid); + +extern struct ccid *ccid_init(unsigned char id, struct sock *sk); +extern void ccid_exit(struct ccid *ccid, struct sock *sk); + +static inline void __ccid_get(struct ccid *ccid) +{ + __module_get(ccid->ccid_owner); +} + +static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb, int len, + long *delay) +{ + int rc = 0; + if (ccid->ccid_hc_tx_send_packet != NULL) + rc = ccid->ccid_hc_tx_send_packet(sk, skb, len, delay); + return rc; +} + +static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, + int more, int len) +{ + if (ccid->ccid_hc_tx_packet_sent != NULL) + ccid->ccid_hc_tx_packet_sent(sk, more, len); +} + +static inline int ccid_hc_rx_init(struct ccid *ccid, struct sock *sk) +{ + int rc = 0; + if (ccid->ccid_hc_rx_init != NULL) + rc = ccid->ccid_hc_rx_init(sk); + return rc; +} + +static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk) +{ + int rc = 0; + if (ccid->ccid_hc_tx_init != NULL) + rc = ccid->ccid_hc_tx_init(sk); + return rc; +} + +static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk) +{ + if (ccid->ccid_hc_rx_exit != NULL) + ccid->ccid_hc_rx_exit(sk); +} + +static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk) +{ + if (ccid->ccid_hc_tx_exit != NULL) + ccid->ccid_hc_tx_exit(sk); +} + +static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_rx_packet_recv != NULL) + ccid->ccid_hc_rx_packet_recv(sk, skb); +} + +static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_tx_packet_recv != NULL) + ccid->ccid_hc_tx_packet_recv(sk, skb); +} + +static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) +{ + int rc = 0; + if (ccid->ccid_hc_tx_parse_options != NULL) + rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx, value); + return rc; +} + +static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) +{ + int rc = 0; + if (ccid->ccid_hc_rx_parse_options != NULL) + rc = ccid->ccid_hc_rx_parse_options(sk, option, len, idx, value); + return rc; +} + +static inline void ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_tx_insert_options != NULL) + ccid->ccid_hc_tx_insert_options(sk, skb); +} + +static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_rx_insert_options != NULL) + ccid->ccid_hc_rx_insert_options(sk, skb); +} +#endif /* _CCID_H */ diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig new file mode 100644 index 0000000..67f9c06 --- /dev/null +++ b/net/dccp/ccids/Kconfig @@ -0,0 +1,25 @@ +menu "DCCP CCIDs Configuration (EXPERIMENTAL)" + depends on IP_DCCP && EXPERIMENTAL + +config IP_DCCP_CCID3 + tristate "CCID3 (TFRC) (EXPERIMENTAL)" + depends on IP_DCCP + ---help--- + CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based + rate-controlled congestion control mechanism. TFRC is designed to + be reasonably fair when competing for bandwidth with TCP-like flows, + where a flow is "reasonably fair" if its sending rate is generally + within a factor of two of the sending rate of a TCP flow under the + same conditions. However, TFRC has a much lower variation of + throughput over time compared with TCP, which makes CCID 3 more + suitable than CCID 2 for applications such streaming media where a + relatively smooth sending rate is of importance. + + CCID 3 is further described in [CCID 3 PROFILE]. The TFRC + congestion control algorithms were initially described in RFC 3448. + + This text was extracted from draft-ietf-dccp-spec-11.txt. + + If in doubt, say M. + +endmenu diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile new file mode 100644 index 0000000..1c72013 --- /dev/null +++ b/net/dccp/ccids/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o + +dccp_ccid3-y := ccid3.o diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c new file mode 100644 index 0000000..4f45902 --- /dev/null +++ b/net/dccp/ccids/ccid3.c @@ -0,0 +1,2164 @@ +/* + * net/dccp/ccids/ccid3.c + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "../ccid.h" +#include "../dccp.h" +#include "ccid3.h" + +#ifdef CCID3_DEBUG +extern int ccid3_debug; + +#define ccid3_pr_debug(format, a...) \ + do { if (ccid3_debug) \ + printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \ + } while (0) +#else +#define ccid3_pr_debug(format, a...) +#endif + +#define TFRC_MIN_PACKET_SIZE 16 +#define TFRC_STD_PACKET_SIZE 256 +#define TFRC_MAX_PACKET_SIZE 65535 + +#define USEC_IN_SEC 1000000 + +#define TFRC_INITIAL_TIMEOUT (2 * USEC_IN_SEC) +/* two seconds as per CCID3 spec 11 */ + +#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_IN_SEC / (2 * HZ)) +/* above is in usecs - half the scheduling granularity as per RFC3448 4.6 */ + +#define TFRC_WIN_COUNT_PER_RTT 4 +#define TFRC_WIN_COUNT_LIMIT 16 + +#define TFRC_MAX_BACK_OFF_TIME 64 +/* above is in seconds */ + +#define TFRC_SMALLEST_P 40 + +#define TFRC_RECV_IVAL_F_LENGTH 8 /* length(w[]) */ + +/* Number of later packets received before one is considered lost */ +#define TFRC_RECV_NUM_LATE_LOSS 3 + +enum ccid3_options { + TFRC_OPT_LOSS_EVENT_RATE = 192, + TFRC_OPT_LOSS_INTERVALS = 193, + TFRC_OPT_RECEIVE_RATE = 194, +}; + +static int ccid3_debug; + +static kmem_cache_t *ccid3_tx_hist_slab; +static kmem_cache_t *ccid3_rx_hist_slab; +static kmem_cache_t *ccid3_loss_interval_hist_slab; + +static inline struct ccid3_tx_hist_entry *ccid3_tx_hist_entry_new(int prio) +{ + struct ccid3_tx_hist_entry *entry = kmem_cache_alloc(ccid3_tx_hist_slab, prio); + + if (entry != NULL) + entry->ccid3htx_sent = 0; + + return entry; +} + +static inline void ccid3_tx_hist_entry_delete(struct ccid3_tx_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(ccid3_tx_hist_slab, entry); +} + +static inline struct ccid3_rx_hist_entry *ccid3_rx_hist_entry_new(struct sock *sk, + struct sk_buff *skb, + int prio) +{ + struct ccid3_rx_hist_entry *entry = kmem_cache_alloc(ccid3_rx_hist_slab, prio); + + if (entry != NULL) { + const struct dccp_hdr *dh = dccp_hdr(skb); + + entry->ccid3hrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + entry->ccid3hrx_win_count = dh->dccph_ccval; + entry->ccid3hrx_type = dh->dccph_type; + entry->ccid3hrx_ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; + do_gettimeofday(&(entry->ccid3hrx_tstamp)); + } + + return entry; +} + +static inline void ccid3_rx_hist_entry_delete(struct ccid3_rx_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(ccid3_rx_hist_slab, entry); +} + +static void ccid3_rx_history_delete(struct list_head *hist) +{ + struct ccid3_rx_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, hist, ccid3hrx_node) { + list_del_init(&entry->ccid3hrx_node); + kmem_cache_free(ccid3_rx_hist_slab, entry); + } +} + +static inline struct ccid3_loss_interval_hist_entry *ccid3_loss_interval_hist_entry_new(int prio) +{ + return kmem_cache_alloc(ccid3_loss_interval_hist_slab, prio); +} + +static inline void ccid3_loss_interval_hist_entry_delete(struct ccid3_loss_interval_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(ccid3_loss_interval_hist_slab, entry); +} + +static void ccid3_loss_interval_history_delete(struct list_head *hist) +{ + struct ccid3_loss_interval_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, hist, ccid3lih_node) { + list_del_init(&entry->ccid3lih_node); + kmem_cache_free(ccid3_loss_interval_hist_slab, entry); + } +} + +static int ccid3_init(struct sock *sk) +{ + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + return 0; +} + +static void ccid3_exit(struct sock *sk) +{ + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); +} + +/* TFRC sender states */ +enum ccid3_hc_tx_states { + TFRC_SSTATE_NO_SENT = 1, + TFRC_SSTATE_NO_FBACK, + TFRC_SSTATE_FBACK, + TFRC_SSTATE_TERM, +}; + +#ifdef CCID3_DEBUG +static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) +{ + static char *ccid3_state_names[] = { + [TFRC_SSTATE_NO_SENT] = "NO_SENT", + [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", + [TFRC_SSTATE_FBACK] = "FBACK", + [TFRC_SSTATE_TERM] = "TERM", + }; + + return ccid3_state_names[state]; +} +#endif + +static inline void ccid3_hc_tx_set_state(struct sock *sk, enum ccid3_hc_tx_states state) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_tx_state_name(oldstate), ccid3_tx_state_name(state)); + WARN_ON(state == oldstate); + hctx->ccid3hctx_state = state; +} + +static void timeval_sub(struct timeval large, struct timeval small, struct timeval *result) { + + result->tv_sec = large.tv_sec-small.tv_sec; + if (large.tv_usec < small.tv_usec) { + (result->tv_sec)--; + result->tv_usec = USEC_IN_SEC+large.tv_usec-small.tv_usec; + } else + result->tv_usec = large.tv_usec-small.tv_usec; +} + +static inline void timeval_fix(struct timeval *tv) { + if (tv->tv_usec >= USEC_IN_SEC) { + tv->tv_sec++; + tv->tv_usec -= USEC_IN_SEC; + } +} + +/* returns the difference in usecs between timeval passed in and current time */ +static inline u32 now_delta(struct timeval tv) { + struct timeval now; + + do_gettimeofday(&now); + return ((now.tv_sec-tv.tv_sec)*1000000+now.tv_usec-tv.tv_usec); +} + +#define CALCX_ARRSIZE 500 + +#define CALCX_SPLIT 50000 +/* equivalent to 0.05 */ + +static const u32 calcx_lookup[CALCX_ARRSIZE][2] = { + { 37172 , 8172 }, + { 53499 , 11567 }, + { 66664 , 14180 }, + { 78298 , 16388 }, + { 89021 , 18339 }, + { 99147 , 20108 }, + { 108858 , 21738 }, + { 118273 , 23260 }, + { 127474 , 24693 }, + { 136520 , 26052 }, + { 145456 , 27348 }, + { 154316 , 28589 }, + { 163130 , 29783 }, + { 171919 , 30935 }, + { 180704 , 32049 }, + { 189502 , 33130 }, + { 198328 , 34180 }, + { 207194 , 35202 }, + { 216114 , 36198 }, + { 225097 , 37172 }, + { 234153 , 38123 }, + { 243294 , 39055 }, + { 252527 , 39968 }, + { 261861 , 40864 }, + { 271305 , 41743 }, + { 280866 , 42607 }, + { 290553 , 43457 }, + { 300372 , 44293 }, + { 310333 , 45117 }, + { 320441 , 45929 }, + { 330705 , 46729 }, + { 341131 , 47518 }, + { 351728 , 48297 }, + { 362501 , 49066 }, + { 373460 , 49826 }, + { 384609 , 50577 }, + { 395958 , 51320 }, + { 407513 , 52054 }, + { 419281 , 52780 }, + { 431270 , 53499 }, + { 443487 , 54211 }, + { 455940 , 54916 }, + { 468635 , 55614 }, + { 481581 , 56306 }, + { 494785 , 56991 }, + { 508254 , 57671 }, + { 521996 , 58345 }, + { 536019 , 59014 }, + { 550331 , 59677 }, + { 564939 , 60335 }, + { 579851 , 60988 }, + { 595075 , 61636 }, + { 610619 , 62279 }, + { 626491 , 62918 }, + { 642700 , 63553 }, + { 659253 , 64183 }, + { 676158 , 64809 }, + { 693424 , 65431 }, + { 711060 , 66050 }, + { 729073 , 66664 }, + { 747472 , 67275 }, + { 766266 , 67882 }, + { 785464 , 68486 }, + { 805073 , 69087 }, + { 825103 , 69684 }, + { 845562 , 70278 }, + { 866460 , 70868 }, + { 887805 , 71456 }, + { 909606 , 72041 }, + { 931873 , 72623 }, + { 954614 , 73202 }, + { 977839 , 73778 }, + { 1001557 , 74352 }, + { 1025777 , 74923 }, + { 1050508 , 75492 }, + { 1075761 , 76058 }, + { 1101544 , 76621 }, + { 1127867 , 77183 }, + { 1154739 , 77741 }, + { 1182172 , 78298 }, + { 1210173 , 78852 }, + { 1238753 , 79405 }, + { 1267922 , 79955 }, + { 1297689 , 80503 }, + { 1328066 , 81049 }, + { 1359060 , 81593 }, + { 1390684 , 82135 }, + { 1422947 , 82675 }, + { 1455859 , 83213 }, + { 1489430 , 83750 }, + { 1523671 , 84284 }, + { 1558593 , 84817 }, + { 1594205 , 85348 }, + { 1630518 , 85878 }, + { 1667543 , 86406 }, + { 1705290 , 86932 }, + { 1743770 , 87457 }, + { 1782994 , 87980 }, + { 1822973 , 88501 }, + { 1863717 , 89021 }, + { 1905237 , 89540 }, + { 1947545 , 90057 }, + { 1990650 , 90573 }, + { 2034566 , 91087 }, + { 2079301 , 91600 }, + { 2124869 , 92111 }, + { 2171279 , 92622 }, + { 2218543 , 93131 }, + { 2266673 , 93639 }, + { 2315680 , 94145 }, + { 2365575 , 94650 }, + { 2416371 , 95154 }, + { 2468077 , 95657 }, + { 2520707 , 96159 }, + { 2574271 , 96660 }, + { 2628782 , 97159 }, + { 2684250 , 97658 }, + { 2740689 , 98155 }, + { 2798110 , 98651 }, + { 2856524 , 99147 }, + { 2915944 , 99641 }, + { 2976382 , 100134 }, + { 3037850 , 100626 }, + { 3100360 , 101117 }, + { 3163924 , 101608 }, + { 3228554 , 102097 }, + { 3294263 , 102586 }, + { 3361063 , 103073 }, + { 3428966 , 103560 }, + { 3497984 , 104045 }, + { 3568131 , 104530 }, + { 3639419 , 105014 }, + { 3711860 , 105498 }, + { 3785467 , 105980 }, + { 3860253 , 106462 }, + { 3936229 , 106942 }, + { 4013410 , 107422 }, + { 4091808 , 107902 }, + { 4171435 , 108380 }, + { 4252306 , 108858 }, + { 4334431 , 109335 }, + { 4417825 , 109811 }, + { 4502501 , 110287 }, + { 4588472 , 110762 }, + { 4675750 , 111236 }, + { 4764349 , 111709 }, + { 4854283 , 112182 }, + { 4945564 , 112654 }, + { 5038206 , 113126 }, + { 5132223 , 113597 }, + { 5227627 , 114067 }, + { 5324432 , 114537 }, + { 5422652 , 115006 }, + { 5522299 , 115474 }, + { 5623389 , 115942 }, + { 5725934 , 116409 }, + { 5829948 , 116876 }, + { 5935446 , 117342 }, + { 6042439 , 117808 }, + { 6150943 , 118273 }, + { 6260972 , 118738 }, + { 6372538 , 119202 }, + { 6485657 , 119665 }, + { 6600342 , 120128 }, + { 6716607 , 120591 }, + { 6834467 , 121053 }, + { 6953935 , 121514 }, + { 7075025 , 121976 }, + { 7197752 , 122436 }, + { 7322131 , 122896 }, + { 7448175 , 123356 }, + { 7575898 , 123815 }, + { 7705316 , 124274 }, + { 7836442 , 124733 }, + { 7969291 , 125191 }, + { 8103877 , 125648 }, + { 8240216 , 126105 }, + { 8378321 , 126562 }, + { 8518208 , 127018 }, + { 8659890 , 127474 }, + { 8803384 , 127930 }, + { 8948702 , 128385 }, + { 9095861 , 128840 }, + { 9244875 , 129294 }, + { 9395760 , 129748 }, + { 9548529 , 130202 }, + { 9703198 , 130655 }, + { 9859782 , 131108 }, + { 10018296 , 131561 }, + { 10178755 , 132014 }, + { 10341174 , 132466 }, + { 10505569 , 132917 }, + { 10671954 , 133369 }, + { 10840345 , 133820 }, + { 11010757 , 134271 }, + { 11183206 , 134721 }, + { 11357706 , 135171 }, + { 11534274 , 135621 }, + { 11712924 , 136071 }, + { 11893673 , 136520 }, + { 12076536 , 136969 }, + { 12261527 , 137418 }, + { 12448664 , 137867 }, + { 12637961 , 138315 }, + { 12829435 , 138763 }, + { 13023101 , 139211 }, + { 13218974 , 139658 }, + { 13417071 , 140106 }, + { 13617407 , 140553 }, + { 13819999 , 140999 }, + { 14024862 , 141446 }, + { 14232012 , 141892 }, + { 14441465 , 142339 }, + { 14653238 , 142785 }, + { 14867346 , 143230 }, + { 15083805 , 143676 }, + { 15302632 , 144121 }, + { 15523842 , 144566 }, + { 15747453 , 145011 }, + { 15973479 , 145456 }, + { 16201939 , 145900 }, + { 16432847 , 146345 }, + { 16666221 , 146789 }, + { 16902076 , 147233 }, + { 17140429 , 147677 }, + { 17381297 , 148121 }, + { 17624696 , 148564 }, + { 17870643 , 149007 }, + { 18119154 , 149451 }, + { 18370247 , 149894 }, + { 18623936 , 150336 }, + { 18880241 , 150779 }, + { 19139176 , 151222 }, + { 19400759 , 151664 }, + { 19665007 , 152107 }, + { 19931936 , 152549 }, + { 20201564 , 152991 }, + { 20473907 , 153433 }, + { 20748982 , 153875 }, + { 21026807 , 154316 }, + { 21307399 , 154758 }, + { 21590773 , 155199 }, + { 21876949 , 155641 }, + { 22165941 , 156082 }, + { 22457769 , 156523 }, + { 22752449 , 156964 }, + { 23049999 , 157405 }, + { 23350435 , 157846 }, + { 23653774 , 158287 }, + { 23960036 , 158727 }, + { 24269236 , 159168 }, + { 24581392 , 159608 }, + { 24896521 , 160049 }, + { 25214642 , 160489 }, + { 25535772 , 160929 }, + { 25859927 , 161370 }, + { 26187127 , 161810 }, + { 26517388 , 162250 }, + { 26850728 , 162690 }, + { 27187165 , 163130 }, + { 27526716 , 163569 }, + { 27869400 , 164009 }, + { 28215234 , 164449 }, + { 28564236 , 164889 }, + { 28916423 , 165328 }, + { 29271815 , 165768 }, + { 29630428 , 166208 }, + { 29992281 , 166647 }, + { 30357392 , 167087 }, + { 30725779 , 167526 }, + { 31097459 , 167965 }, + { 31472452 , 168405 }, + { 31850774 , 168844 }, + { 32232445 , 169283 }, + { 32617482 , 169723 }, + { 33005904 , 170162 }, + { 33397730 , 170601 }, + { 33792976 , 171041 }, + { 34191663 , 171480 }, + { 34593807 , 171919 }, + { 34999428 , 172358 }, + { 35408544 , 172797 }, + { 35821174 , 173237 }, + { 36237335 , 173676 }, + { 36657047 , 174115 }, + { 37080329 , 174554 }, + { 37507197 , 174993 }, + { 37937673 , 175433 }, + { 38371773 , 175872 }, + { 38809517 , 176311 }, + { 39250924 , 176750 }, + { 39696012 , 177190 }, + { 40144800 , 177629 }, + { 40597308 , 178068 }, + { 41053553 , 178507 }, + { 41513554 , 178947 }, + { 41977332 , 179386 }, + { 42444904 , 179825 }, + { 42916290 , 180265 }, + { 43391509 , 180704 }, + { 43870579 , 181144 }, + { 44353520 , 181583 }, + { 44840352 , 182023 }, + { 45331092 , 182462 }, + { 45825761 , 182902 }, + { 46324378 , 183342 }, + { 46826961 , 183781 }, + { 47333531 , 184221 }, + { 47844106 , 184661 }, + { 48358706 , 185101 }, + { 48877350 , 185541 }, + { 49400058 , 185981 }, + { 49926849 , 186421 }, + { 50457743 , 186861 }, + { 50992759 , 187301 }, + { 51531916 , 187741 }, + { 52075235 , 188181 }, + { 52622735 , 188622 }, + { 53174435 , 189062 }, + { 53730355 , 189502 }, + { 54290515 , 189943 }, + { 54854935 , 190383 }, + { 55423634 , 190824 }, + { 55996633 , 191265 }, + { 56573950 , 191706 }, + { 57155606 , 192146 }, + { 57741621 , 192587 }, + { 58332014 , 193028 }, + { 58926806 , 193470 }, + { 59526017 , 193911 }, + { 60129666 , 194352 }, + { 60737774 , 194793 }, + { 61350361 , 195235 }, + { 61967446 , 195677 }, + { 62589050 , 196118 }, + { 63215194 , 196560 }, + { 63845897 , 197002 }, + { 64481179 , 197444 }, + { 65121061 , 197886 }, + { 65765563 , 198328 }, + { 66414705 , 198770 }, + { 67068508 , 199213 }, + { 67726992 , 199655 }, + { 68390177 , 200098 }, + { 69058085 , 200540 }, + { 69730735 , 200983 }, + { 70408147 , 201426 }, + { 71090343 , 201869 }, + { 71777343 , 202312 }, + { 72469168 , 202755 }, + { 73165837 , 203199 }, + { 73867373 , 203642 }, + { 74573795 , 204086 }, + { 75285124 , 204529 }, + { 76001380 , 204973 }, + { 76722586 , 205417 }, + { 77448761 , 205861 }, + { 78179926 , 206306 }, + { 78916102 , 206750 }, + { 79657310 , 207194 }, + { 80403571 , 207639 }, + { 81154906 , 208084 }, + { 81911335 , 208529 }, + { 82672880 , 208974 }, + { 83439562 , 209419 }, + { 84211402 , 209864 }, + { 84988421 , 210309 }, + { 85770640 , 210755 }, + { 86558080 , 211201 }, + { 87350762 , 211647 }, + { 88148708 , 212093 }, + { 88951938 , 212539 }, + { 89760475 , 212985 }, + { 90574339 , 213432 }, + { 91393551 , 213878 }, + { 92218133 , 214325 }, + { 93048107 , 214772 }, + { 93883493 , 215219 }, + { 94724314 , 215666 }, + { 95570590 , 216114 }, + { 96422343 , 216561 }, + { 97279594 , 217009 }, + { 98142366 , 217457 }, + { 99010679 , 217905 }, + { 99884556 , 218353 }, + { 100764018 , 218801 }, + { 101649086 , 219250 }, + { 102539782 , 219698 }, + { 103436128 , 220147 }, + { 104338146 , 220596 }, + { 105245857 , 221046 }, + { 106159284 , 221495 }, + { 107078448 , 221945 }, + { 108003370 , 222394 }, + { 108934074 , 222844 }, + { 109870580 , 223294 }, + { 110812910 , 223745 }, + { 111761087 , 224195 }, + { 112715133 , 224646 }, + { 113675069 , 225097 }, + { 114640918 , 225548 }, + { 115612702 , 225999 }, + { 116590442 , 226450 }, + { 117574162 , 226902 }, + { 118563882 , 227353 }, + { 119559626 , 227805 }, + { 120561415 , 228258 }, + { 121569272 , 228710 }, + { 122583219 , 229162 }, + { 123603278 , 229615 }, + { 124629471 , 230068 }, + { 125661822 , 230521 }, + { 126700352 , 230974 }, + { 127745083 , 231428 }, + { 128796039 , 231882 }, + { 129853241 , 232336 }, + { 130916713 , 232790 }, + { 131986475 , 233244 }, + { 133062553 , 233699 }, + { 134144966 , 234153 }, + { 135233739 , 234608 }, + { 136328894 , 235064 }, + { 137430453 , 235519 }, + { 138538440 , 235975 }, + { 139652876 , 236430 }, + { 140773786 , 236886 }, + { 141901190 , 237343 }, + { 143035113 , 237799 }, + { 144175576 , 238256 }, + { 145322604 , 238713 }, + { 146476218 , 239170 }, + { 147636442 , 239627 }, + { 148803298 , 240085 }, + { 149976809 , 240542 }, + { 151156999 , 241000 }, + { 152343890 , 241459 }, + { 153537506 , 241917 }, + { 154737869 , 242376 }, + { 155945002 , 242835 }, + { 157158929 , 243294 }, + { 158379673 , 243753 }, + { 159607257 , 244213 }, + { 160841704 , 244673 }, + { 162083037 , 245133 }, + { 163331279 , 245593 }, + { 164586455 , 246054 }, + { 165848586 , 246514 }, + { 167117696 , 246975 }, + { 168393810 , 247437 }, + { 169676949 , 247898 }, + { 170967138 , 248360 }, + { 172264399 , 248822 }, + { 173568757 , 249284 }, + { 174880235 , 249747 }, + { 176198856 , 250209 }, + { 177524643 , 250672 }, + { 178857621 , 251136 }, + { 180197813 , 251599 }, + { 181545242 , 252063 }, + { 182899933 , 252527 }, + { 184261908 , 252991 }, + { 185631191 , 253456 }, + { 187007807 , 253920 }, + { 188391778 , 254385 }, + { 189783129 , 254851 }, + { 191181884 , 255316 }, + { 192588065 , 255782 }, + { 194001698 , 256248 }, + { 195422805 , 256714 }, + { 196851411 , 257181 }, + { 198287540 , 257648 }, + { 199731215 , 258115 }, + { 201182461 , 258582 }, + { 202641302 , 259050 }, + { 204107760 , 259518 }, + { 205581862 , 259986 }, + { 207063630 , 260454 }, + { 208553088 , 260923 }, + { 210050262 , 261392 }, + { 211555174 , 261861 }, + { 213067849 , 262331 }, + { 214588312 , 262800 }, + { 216116586 , 263270 }, + { 217652696 , 263741 }, + { 219196666 , 264211 }, + { 220748520 , 264682 }, + { 222308282 , 265153 }, + { 223875978 , 265625 }, + { 225451630 , 266097 }, + { 227035265 , 266569 }, + { 228626905 , 267041 }, + { 230226576 , 267514 }, + { 231834302 , 267986 }, + { 233450107 , 268460 }, + { 235074016 , 268933 }, + { 236706054 , 269407 }, + { 238346244 , 269881 }, + { 239994613 , 270355 }, + { 241651183 , 270830 }, + { 243315981 , 271305 } +}; + +/* Calculate the send rate as per section 3.1 of RFC3448 + +Returns send rate in bytes per second + +Integer maths and lookups are used as not allowed floating point in kernel + +The function for Xcalc as per section 3.1 of RFC3448 is: + +X = s + ------------------------------------------------------------- + R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2))) + +where +X is the trasmit rate in bytes/second +s is the packet size in bytes +R is the round trip time in seconds +p is the loss event rate, between 0 and 1.0, of the number of loss events + as a fraction of the number of packets transmitted +t_RTO is the TCP retransmission timeout value in seconds +b is the number of packets acknowledged by a single TCP acknowledgement + +we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes: + +X = s + ----------------------------------------------------------------------- + R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2))) + + +which we can break down into: + +X = s + -------- + R * f(p) + +where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p)) + +Function parameters: +s - bytes +R - RTT in usecs +p - loss rate (decimal fraction multiplied by 1,000,000) + +Returns Xcalc in bytes per second + +DON'T alter this code unless you run test cases against it as the code +has been manipulated to stop underflow/overlow. + +*/ +static u32 ccid3_calc_x(u16 s, u32 R, u32 p) +{ + int index; + u32 f; + u64 tmp1, tmp2; + + if (p < CALCX_SPLIT) + index = (p / (CALCX_SPLIT / CALCX_ARRSIZE)) - 1; + else + index = (p / (1000000 / CALCX_ARRSIZE)) - 1; + + if (index < 0) + /* p should be 0 unless there is a bug in my code */ + index = 0; + + if (R == 0) + R = 1; /* RTT can't be zero or else divide by zero */ + + BUG_ON(index >= CALCX_ARRSIZE); + + if (p >= CALCX_SPLIT) + f = calcx_lookup[index][0]; + else + f = calcx_lookup[index][1]; + + tmp1 = ((u64)s * 100000000); + tmp2 = ((u64)R * (u64)f); + do_div(tmp2,10000); + do_div(tmp1,tmp2); + /* don't alter above math unless you test due to overflow on 32 bit */ + + return (u32)tmp1; +} + +/* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */ +static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx) +{ + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) + return; + /* if no feedback spec says t_ipi is 1 second (set elsewhere and then + * doubles after every no feedback timer (separate function) */ + + if (hctx->ccid3hctx_x < 10) { + ccid3_pr_debug("ccid3_calc_new_t_ipi - ccid3hctx_x < 10\n"); + hctx->ccid3hctx_x = 10; + } + hctx->ccid3hctx_t_ipi = (hctx->ccid3hctx_s * 100000) + / (hctx->ccid3hctx_x / 10); + /* reason for above maths with 10 in there is to avoid 32 bit + * overflow for jumbo packets */ + +} + +/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ +static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx) +{ + hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); + +} + +/* + * Update X by + * If (p > 0) + * x_calc = calcX(s, R, p); + * X = max(min(X_calc, 2 * X_recv), s / t_mbi); + * Else + * If (now - tld >= R) + * X = max(min(2 * X, 2 * X_recv), s / R); + * tld = now; + */ +static void ccid3_hc_tx_update_x(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + + if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) { /* to avoid large error in calcX */ + hctx->ccid3hctx_x_calc = ccid3_calc_x(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p); + hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc, 2 * hctx->ccid3hctx_x_recv), + hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME); + } else if (now_delta(hctx->ccid3hctx_t_ld) >= hctx->ccid3hctx_rtt) { + u32 rtt = hctx->ccid3hctx_rtt; + if (rtt < 10) { + rtt = 10; + } /* avoid divide by zero below */ + + hctx->ccid3hctx_x = max_t(u32, min_t(u32, 2 * hctx->ccid3hctx_x_recv, 2 * hctx->ccid3hctx_x), + (hctx->ccid3hctx_s * 100000) / (rtt / 10)); + /* Using 100000 and 10 to avoid 32 bit overflow for jumbo frames */ + do_gettimeofday(&hctx->ccid3hctx_t_ld); + } + + if (hctx->ccid3hctx_x == 0) { + ccid3_pr_debug("ccid3hctx_x = 0!\n"); + hctx->ccid3hctx_x = 1; + } +} + +static void ccid3_hc_tx_no_feedback_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct dccp_sock *dp = dccp_sk(sk); + unsigned long next_tmout = 0; + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + u32 rtt; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + /* XXX: set some sensible MIB */ + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + HZ / 5); + goto out; + } + + ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state)); + + if (hctx->ccid3hctx_x < 10) { + ccid3_pr_debug("TFRC_SSTATE_NO_FBACK ccid3hctx_x < 10\n"); + hctx->ccid3hctx_x = 10; + } + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_TERM: + goto out; + case TFRC_SSTATE_NO_FBACK: + /* Halve send rate */ + hctx->ccid3hctx_x /= 2; + if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME)) + hctx->ccid3hctx_x = hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME; + + ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d bytes/s\n", + dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state), + hctx->ccid3hctx_x); + next_tmout = max_t(u32, 2 * (hctx->ccid3hctx_s * 100000) + / (hctx->ccid3hctx_x / 10), TFRC_INITIAL_TIMEOUT); + /* do above maths with 100000 and 10 to prevent overflow on 32 bit */ + /* FIXME - not sure above calculation is correct. See section 5 of CCID3 11 + * should adjust tx_t_ipi and double that to achieve it really */ + break; + case TFRC_SSTATE_FBACK: + /* Check if IDLE since last timeout and recv rate is less than 4 packets per RTT */ + rtt = hctx->ccid3hctx_rtt; + if (rtt < 10) + rtt = 10; + /* stop divide by zero below */ + if (!hctx->ccid3hctx_idle || (hctx->ccid3hctx_x_recv >= + 4 * (hctx->ccid3hctx_s * 100000) / (rtt / 10))) { + ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n", dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state)); + /* Halve sending rate */ + + /* If (X_calc > 2 * X_recv) + * X_recv = max(X_recv / 2, s / (2 * t_mbi)); + * Else + * X_recv = X_calc / 4; + */ + BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && hctx->ccid3hctx_x_calc == 0); + + /* check also if p is zero -> x_calc is infinity? */ + if (hctx->ccid3hctx_p < TFRC_SMALLEST_P || + hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv) + hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2, + hctx->ccid3hctx_s / (2 * TFRC_MAX_BACK_OFF_TIME)); + else + hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4; + + /* Update sending rate */ + ccid3_hc_tx_update_x(sk); + } + if (hctx->ccid3hctx_x == 0) { + ccid3_pr_debug("TFRC_SSTATE_FBACK ccid3hctx_x = 0!\n"); + hctx->ccid3hctx_x = 10; + } + /* Schedule no feedback timer to expire in max(4 * R, 2 * s / X) */ + next_tmout = max_t(u32, inet_csk(sk)->icsk_rto, + 2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x / 10)); + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + goto out; + } + + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); + hctx->ccid3hctx_idle = 1; +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb, + int len, long *delay) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_tx_hist_entry *new_packet = NULL; + struct timeval now; + int rc = -ENOTCONN; + +// ccid3_pr_debug("%s, sk=%p, skb=%p, len=%d\n", dccp_role(sk), sk, skb, len); + /* + * check if pure ACK or Terminating */ + /* XXX: We only call this function for DATA and DATAACK, on, these packets can have + * zero length, but why the comment about "pure ACK"? + */ + if (hctx == NULL || len == 0 || hctx->ccid3hctx_state == TFRC_SSTATE_TERM) + goto out; + + /* See if last packet allocated was not sent */ + if (!list_empty(&hctx->ccid3hctx_hist)) + new_packet = list_entry(hctx->ccid3hctx_hist.next, + struct ccid3_tx_hist_entry, ccid3htx_node); + + if (new_packet == NULL || new_packet->ccid3htx_sent) { + new_packet = ccid3_tx_hist_entry_new(SLAB_ATOMIC); + + rc = -ENOBUFS; + if (new_packet == NULL) { + ccid3_pr_debug("%s, sk=%p, not enough mem to add " + "to history, send refused\n", dccp_role(sk), sk); + goto out; + } + + list_add(&new_packet->ccid3htx_node, &hctx->ccid3hctx_hist); + } + + do_gettimeofday(&now); + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + ccid3_pr_debug("%s, sk=%p, first packet(%llu)\n", dccp_role(sk), sk, + dp->dccps_gss); + + hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer; + hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk; + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)); + hctx->ccid3hctx_last_win_count = 0; + hctx->ccid3hctx_t_last_win_count = now; + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + hctx->ccid3hctx_t_ipi = TFRC_INITIAL_TIMEOUT; + + /* Set nominal send time for initial packet */ + hctx->ccid3hctx_t_nom = now; + (hctx->ccid3hctx_t_nom).tv_usec += hctx->ccid3hctx_t_ipi; + timeval_fix(&(hctx->ccid3hctx_t_nom)); + ccid3_calc_new_delta(hctx); + rc = 0; + break; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + *delay = (now_delta(hctx->ccid3hctx_t_nom) - hctx->ccid3hctx_delta); + ccid3_pr_debug("send_packet delay=%ld\n",*delay); + *delay /= -1000; + /* divide by -1000 is to convert to ms and get sign right */ + rc = *delay > 0 ? -EAGAIN : 0; + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + rc = -EINVAL; + break; + } + + /* Can we send? if so add options and add to packet history */ + if (rc == 0) + new_packet->ccid3htx_win_count = DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; +out: + return rc; +} + +static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_tx_hist_entry *packet = NULL; + struct timeval now; + +// ccid3_pr_debug("%s, sk=%p, more=%d, len=%d\n", dccp_role(sk), sk, more, len); + BUG_ON(hctx == NULL); + + if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) { + ccid3_pr_debug("%s, sk=%p, while state is TFRC_SSTATE_TERM!\n", + dccp_role(sk), sk); + return; + } + + do_gettimeofday(&now); + + /* check if we have sent a data packet */ + if (len > 0) { + unsigned long quarter_rtt; + + if (list_empty(&hctx->ccid3hctx_hist)) { + printk(KERN_CRIT "%s: packet doesn't exists in history!\n", __FUNCTION__); + return; + } + packet = list_entry(hctx->ccid3hctx_hist.next, struct ccid3_tx_hist_entry, ccid3htx_node); + if (packet->ccid3htx_sent) { + printk(KERN_CRIT "%s: no unsent packet in history!\n", __FUNCTION__); + return; + } + packet->ccid3htx_tstamp = now; + packet->ccid3htx_seqno = dp->dccps_gss; + // ccid3_pr_debug("%s, sk=%p, seqno=%llu inserted!\n", dccp_role(sk), sk, packet->ccid3htx_seqno); + + /* + * Check if win_count have changed */ + /* COMPLIANCE_BEGIN + * Algorithm in "8.1. Window Counter Valuer" in draft-ietf-dccp-ccid3-11.txt + */ + quarter_rtt = now_delta(hctx->ccid3hctx_t_last_win_count) / (hctx->ccid3hctx_rtt / 4); + if (quarter_rtt > 0) { + hctx->ccid3hctx_t_last_win_count = now; + hctx->ccid3hctx_last_win_count = (hctx->ccid3hctx_last_win_count + + min_t(unsigned long, quarter_rtt, 5)) % 16; + ccid3_pr_debug("%s, sk=%p, window changed from %u to %u!\n", + dccp_role(sk), sk, + packet->ccid3htx_win_count, + hctx->ccid3hctx_last_win_count); + } + /* COMPLIANCE_END */ +#if 0 + ccid3_pr_debug("%s, sk=%p, packet sent (%llu,%u)\n", + dccp_role(sk), sk, + packet->ccid3htx_seqno, + packet->ccid3htx_win_count); +#endif + hctx->ccid3hctx_idle = 0; + packet->ccid3htx_sent = 1; + } else + ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n", + dccp_role(sk), sk, dp->dccps_gss); + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + /* if first wasn't pure ack */ + if (len != 0) + printk(KERN_CRIT "%s: %s, First packet sent is noted as a data packet\n", + __FUNCTION__, dccp_role(sk)); + return; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + if (len > 0) { + hctx->ccid3hctx_t_nom = now; + ccid3_calc_new_t_ipi(hctx); + ccid3_calc_new_delta(hctx); + (hctx->ccid3hctx_t_nom).tv_usec += hctx->ccid3hctx_t_ipi; + timeval_fix(&(hctx->ccid3hctx_t_nom)); + } + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + break; + } +} + +static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_options_received *opt_recv; + struct ccid3_tx_hist_entry *entry, *next, *packet; + unsigned long next_tmout; + u16 t_elapsed; + u32 pinv; + u32 x_recv; + u32 r_sample; +#if 0 + ccid3_pr_debug("%s, sk=%p(%s), skb=%p(%s)\n", + dccp_role(sk), sk, dccp_state_name(sk->sk_state), + skb, dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); +#endif + if (hctx == NULL) + return; + + if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) { + ccid3_pr_debug("%s, sk=%p, received a packet when terminating!\n", dccp_role(sk), sk); + return; + } + + /* we are only interested in ACKs */ + if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || + DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) + return; + + opt_recv = &hctx->ccid3hctx_options_received; + + t_elapsed = dp->dccps_options_received.dccpor_elapsed_time; + x_recv = opt_recv->ccid3or_receive_rate; + pinv = opt_recv->ccid3or_loss_event_rate; + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + /* FIXME: what to do here? */ + return; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + /* Calculate new round trip sample by + * R_sample = (now - t_recvdata) - t_delay */ + /* get t_recvdata from history */ + packet = NULL; + list_for_each_entry_safe(entry, next, &hctx->ccid3hctx_hist, ccid3htx_node) + if (entry->ccid3htx_seqno == DCCP_SKB_CB(skb)->dccpd_ack_seq) { + packet = entry; + break; + } + + if (packet == NULL) { + ccid3_pr_debug("%s, sk=%p, seqno %llu(%s) does't exist in history!\n", + dccp_role(sk), sk, DCCP_SKB_CB(skb)->dccpd_ack_seq, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); + return; + } + + /* Update RTT */ + r_sample = now_delta(packet->ccid3htx_tstamp); + /* FIXME: */ + // r_sample -= usecs_to_jiffies(t_elapsed * 10); + + /* Update RTT estimate by + * If (No feedback recv) + * R = R_sample; + * Else + * R = q * R + (1 - q) * R_sample; + * + * q is a constant, RFC 3448 recomments 0.9 + */ + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); + hctx->ccid3hctx_rtt = r_sample; + } else + hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 + r_sample / 10; + + /* + * XXX: this is to avoid a division by zero in ccid3_hc_tx_packet_sent + * implemention of the new window count. + */ + if (hctx->ccid3hctx_rtt < 4) + hctx->ccid3hctx_rtt = 4; + + ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, r_sample=%us\n", + dccp_role(sk), sk, + hctx->ccid3hctx_rtt, + r_sample); + + /* Update timeout interval */ + inet_csk(sk)->icsk_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, USEC_IN_SEC); + + /* Update receive rate */ + hctx->ccid3hctx_x_recv = x_recv; /* x_recv in bytes per second */ + + /* Update loss event rate */ + if (pinv == ~0 || pinv == 0) + hctx->ccid3hctx_p = 0; + else { + hctx->ccid3hctx_p = 1000000 / pinv; + + if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { + hctx->ccid3hctx_p = TFRC_SMALLEST_P; + ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", dccp_role(sk), sk); + } + } + + /* unschedule no feedback timer */ + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + + /* Update sending rate */ + ccid3_hc_tx_update_x(sk); + + /* Update next send time */ + if (hctx->ccid3hctx_t_ipi > (hctx->ccid3hctx_t_nom).tv_usec) { + (hctx->ccid3hctx_t_nom).tv_usec += USEC_IN_SEC; + (hctx->ccid3hctx_t_nom).tv_sec--; + } + /* FIXME - if no feedback then t_ipi can go > 1 second */ + (hctx->ccid3hctx_t_nom).tv_usec -= hctx->ccid3hctx_t_ipi; + ccid3_calc_new_t_ipi(hctx); + (hctx->ccid3hctx_t_nom).tv_usec += hctx->ccid3hctx_t_ipi; + timeval_fix(&(hctx->ccid3hctx_t_nom)); + ccid3_calc_new_delta(hctx); + + /* remove all packets older than the one acked from history */ +#if 0 + FIXME! + list_for_each_entry_safe_continue(entry, next, &hctx->ccid3hctx_hist, ccid3htx_node) { + list_del_init(&entry->ccid3htx_node); + ccid3_tx_hist_entry_delete(entry); + } +#endif + if (hctx->ccid3hctx_x < 10) { + ccid3_pr_debug("ccid3_hc_tx_packet_recv hctx->ccid3hctx_x < 10\n"); + hctx->ccid3hctx_x = 10; + } + /* to prevent divide by zero below */ + + /* Schedule no feedback timer to expire in max(4 * R, 2 * s / X) */ + next_tmout = max(inet_csk(sk)->icsk_rto, + 2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x/10)); + /* maths with 100000 and 10 is to prevent overflow with 32 bit */ + + ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to expire in %lu jiffies (%luus)\n", + dccp_role(sk), sk, usecs_to_jiffies(next_tmout), next_tmout); + + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + max_t(u32,1,usecs_to_jiffies(next_tmout))); + + /* set idle flag */ + hctx->ccid3hctx_idle = 1; + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + break; + } +} + +static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + + if (hctx == NULL || !(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) + return; + + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; +} + +static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, + unsigned char len, u16 idx, unsigned char *value) +{ + int rc = 0; + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_options_received *opt_recv; + + if (hctx == NULL) + return 0; + + opt_recv = &hctx->ccid3hctx_options_received; + + if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { + opt_recv->ccid3or_seqno = dp->dccps_gsr; + opt_recv->ccid3or_loss_event_rate = ~0; + opt_recv->ccid3or_loss_intervals_idx = 0; + opt_recv->ccid3or_loss_intervals_len = 0; + opt_recv->ccid3or_receive_rate = 0; + } + + switch (option) { + case TFRC_OPT_LOSS_EVENT_RATE: + if (len != 4) { + ccid3_pr_debug("%s, sk=%p, invalid len for TFRC_OPT_LOSS_EVENT_RATE\n", + dccp_role(sk), sk); + rc = -EINVAL; + } else { + opt_recv->ccid3or_loss_event_rate = ntohl(*(u32 *)value); + ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n", + dccp_role(sk), sk, + opt_recv->ccid3or_loss_event_rate); + } + break; + case TFRC_OPT_LOSS_INTERVALS: + opt_recv->ccid3or_loss_intervals_idx = idx; + opt_recv->ccid3or_loss_intervals_len = len; + ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n", + dccp_role(sk), sk, + opt_recv->ccid3or_loss_intervals_idx, + opt_recv->ccid3or_loss_intervals_len); + break; + case TFRC_OPT_RECEIVE_RATE: + if (len != 4) { + ccid3_pr_debug("%s, sk=%p, invalid len for TFRC_OPT_RECEIVE_RATE\n", + dccp_role(sk), sk); + rc = -EINVAL; + } else { + opt_recv->ccid3or_receive_rate = ntohl(*(u32 *)value); + ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n", + dccp_role(sk), sk, + opt_recv->ccid3or_receive_rate); + } + break; + } + + return rc; +} + +static int ccid3_hc_tx_init(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + hctx = dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), gfp_any()); + if (hctx == NULL) + return -ENOMEM; + + memset(hctx, 0, sizeof(*hctx)); + + if (dp->dccps_avg_packet_size >= TFRC_MIN_PACKET_SIZE && + dp->dccps_avg_packet_size <= TFRC_MAX_PACKET_SIZE) + hctx->ccid3hctx_s = (u16)dp->dccps_avg_packet_size; + else + hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE; + + hctx->ccid3hctx_x = hctx->ccid3hctx_s; /* set transmission rate to 1 packet per second */ + hctx->ccid3hctx_rtt = 4; /* See ccid3_hc_tx_packet_sent win_count calculatation */ + inet_csk(sk)->icsk_rto = USEC_IN_SEC; + hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; + INIT_LIST_HEAD(&hctx->ccid3hctx_hist); + init_timer(&hctx->ccid3hctx_no_feedback_timer); + + return 0; +} + +static void ccid3_hc_tx_exit(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_tx_hist_entry *entry, *next; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + BUG_ON(hctx == NULL); + + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + + /* Empty packet history */ + list_for_each_entry_safe(entry, next, &hctx->ccid3hctx_hist, ccid3htx_node) { + list_del_init(&entry->ccid3htx_node); + ccid3_tx_hist_entry_delete(entry); + } + + kfree(dp->dccps_hc_tx_ccid_private); + dp->dccps_hc_tx_ccid_private = NULL; +} + +/* + * RX Half Connection methods + */ + +/* TFRC receiver states */ +enum ccid3_hc_rx_states { + TFRC_RSTATE_NO_DATA = 1, + TFRC_RSTATE_DATA, + TFRC_RSTATE_TERM = 127, +}; + +#ifdef CCID3_DEBUG +static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) +{ + static char *ccid3_rx_state_names[] = { + [TFRC_RSTATE_NO_DATA] = "NO_DATA", + [TFRC_RSTATE_DATA] = "DATA", + [TFRC_RSTATE_TERM] = "TERM", + }; + + return ccid3_rx_state_names[state]; +} +#endif + +static inline void ccid3_hc_rx_set_state(struct sock *sk, enum ccid3_hc_rx_states state) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_rx_state_name(oldstate), ccid3_rx_state_name(state)); + WARN_ON(state == oldstate); + hcrx->ccid3hcrx_state = state; +} + +static int ccid3_hc_rx_add_hist(struct sock *sk, struct ccid3_rx_hist_entry *packet) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_rx_hist_entry *entry, *next; + u8 num_later = 0; + + if (list_empty(&hcrx->ccid3hcrx_hist)) + list_add(&packet->ccid3hrx_node, &hcrx->ccid3hcrx_hist); + else { + u64 seqno = packet->ccid3hrx_seqno; + struct ccid3_rx_hist_entry *iter = list_entry(hcrx->ccid3hcrx_hist.next, + struct ccid3_rx_hist_entry, + ccid3hrx_node); + if (after48(seqno, iter->ccid3hrx_seqno)) + list_add(&packet->ccid3hrx_node, &hcrx->ccid3hcrx_hist); + else { + if (iter->ccid3hrx_type == DCCP_PKT_DATA || + iter->ccid3hrx_type == DCCP_PKT_DATAACK) + num_later = 1; + + list_for_each_entry_continue(iter, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (after48(seqno, iter->ccid3hrx_seqno)) { + list_add(&packet->ccid3hrx_node, &iter->ccid3hrx_node); + goto trim_history; + } + + if (iter->ccid3hrx_type == DCCP_PKT_DATA || + iter->ccid3hrx_type == DCCP_PKT_DATAACK) + num_later++; + + if (num_later == TFRC_RECV_NUM_LATE_LOSS) { + ccid3_rx_hist_entry_delete(packet); + ccid3_pr_debug("%s, sk=%p, packet(%llu) already lost!\n", + dccp_role(sk), sk, seqno); + return 1; + } + } + + if (num_later < TFRC_RECV_NUM_LATE_LOSS) + list_add_tail(&packet->ccid3hrx_node, &hcrx->ccid3hcrx_hist); + /* FIXME: else what? should we destroy the packet like above? */ + } + } + +trim_history: + /* Trim history (remove all packets after the NUM_LATE_LOSS + 1 data packets) */ + num_later = TFRC_RECV_NUM_LATE_LOSS + 1; + + if (!list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (num_later == 0) { + list_del_init(&entry->ccid3hrx_node); + ccid3_rx_hist_entry_delete(entry); + } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) + --num_later; + } + } else { + int step = 0; + u8 win_count = 0; /* Not needed, but lets shut up gcc */ + int tmp; + /* + * We have no loss interval history so we need at least one + * rtt:s of data packets to approximate rtt. + */ + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (num_later == 0) { + switch (step) { + case 0: + step = 1; + /* OK, find next data packet */ + num_later = 1; + break; + case 1: + step = 2; + /* OK, find next data packet */ + num_later = 1; + win_count = entry->ccid3hrx_win_count; + break; + case 2: + tmp = win_count - entry->ccid3hrx_win_count; + if (tmp < 0) + tmp += TFRC_WIN_COUNT_LIMIT; + if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) { + /* we have found a packet older than one rtt + * remove the rest */ + step = 3; + } else /* OK, find next data packet */ + num_later = 1; + break; + case 3: + list_del_init(&entry->ccid3hrx_node); + ccid3_rx_hist_entry_delete(entry); + break; + } + } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) + --num_later; + } + } + + return 0; +} + +static void ccid3_hc_rx_send_feedback(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_rx_hist_entry *entry, *packet; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + switch (hcrx->ccid3hcrx_state) { + case TFRC_RSTATE_NO_DATA: + hcrx->ccid3hcrx_x_recv = 0; + break; + case TFRC_RSTATE_DATA: { + u32 delta = now_delta(hcrx->ccid3hcrx_tstamp_last_feedback); + + if (delta == 0) + delta = 1; /* to prevent divide by zero */ + hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv * USEC_IN_SEC) / delta; + } + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state); + dump_stack(); + return; + } + + packet = NULL; + list_for_each_entry(entry, &hcrx->ccid3hcrx_hist, ccid3hrx_node) + if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) { + packet = entry; + break; + } + + if (packet == NULL) { + printk(KERN_CRIT "%s: %s, sk=%p, no data packet in history!\n", + __FUNCTION__, dccp_role(sk), sk); + dump_stack(); + return; + } + + do_gettimeofday(&(hcrx->ccid3hcrx_tstamp_last_feedback)); + hcrx->ccid3hcrx_last_counter = packet->ccid3hrx_win_count; + hcrx->ccid3hcrx_seqno_last_counter = packet->ccid3hrx_seqno; + hcrx->ccid3hcrx_bytes_recv = 0; + + /* Convert to multiples of 10us */ + hcrx->ccid3hcrx_elapsed_time = now_delta(packet->ccid3hrx_tstamp) / 10; + if (hcrx->ccid3hcrx_p == 0) + hcrx->ccid3hcrx_pinv = ~0; + else + hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p; + dccp_send_ack(sk); +} + +static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + + if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) + return; + + if (hcrx->ccid3hcrx_elapsed_time != 0 && !dccp_packet_without_ack(skb)) + dccp_insert_option_elapsed_time(sk, skb, hcrx->ccid3hcrx_elapsed_time); + + if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { + const u32 x_recv = htonl(hcrx->ccid3hcrx_x_recv); + const u32 pinv = htonl(hcrx->ccid3hcrx_pinv); + + dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, &pinv, sizeof(pinv)); + dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE, &x_recv, sizeof(x_recv)); + } + + DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter; +} + +/* Weights used to calculate loss event rate */ +/* + * These are integers as per section 8 of RFC3448. We can then divide by 4 * + * when we use it. + */ +const int ccid3_hc_rx_w[TFRC_RECV_IVAL_F_LENGTH] = { 4, 4, 4, 4, 3, 2, 1, 1, }; + +/* + * args: fvalue - function value to match + * returns: p closest to that value + * + * both fvalue and p are multiplied by 1,000,000 to use ints + */ +u32 calcx_reverse_lookup(u32 fvalue) { + int ctr = 0; + int small; + + if (fvalue < calcx_lookup[0][1]) + return 0; + if (fvalue <= calcx_lookup[CALCX_ARRSIZE-1][1]) + small = 1; + else if (fvalue > calcx_lookup[CALCX_ARRSIZE-1][0]) + return 1000000; + else + small = 0; + while (fvalue > calcx_lookup[ctr][small]) + ctr++; + if (small) + return (CALCX_SPLIT * ctr / CALCX_ARRSIZE); + else + return (1000000 * ctr / CALCX_ARRSIZE) ; +} + +/* calculate first loss interval + * + * returns estimated loss interval in usecs */ + +static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_rx_hist_entry *entry, *next, *tail = NULL; + u32 rtt, delta, x_recv, fval, p, tmp2; + struct timeval tstamp, tmp_tv; + int interval = 0; + int win_count = 0; + int step = 0; + u64 tmp1; + + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) { + tail = entry; + + switch (step) { + case 0: + tstamp = entry->ccid3hrx_tstamp; + win_count = entry->ccid3hrx_win_count; + step = 1; + break; + case 1: + interval = win_count - entry->ccid3hrx_win_count; + if (interval < 0) + interval += TFRC_WIN_COUNT_LIMIT; + if (interval > 4) + goto found; + break; + } + } + } + + if (step == 0) { + printk(KERN_CRIT "%s: %s, sk=%p, packet history contains no data packets!\n", + __FUNCTION__, dccp_role(sk), sk); + return ~0; + } + + if (interval == 0) { + ccid3_pr_debug("%s, sk=%p, Could not find a win_count interval > 0. Defaulting to 1\n", + dccp_role(sk), sk); + interval = 1; + } +found: + timeval_sub(tstamp,tail->ccid3hrx_tstamp,&tmp_tv); + rtt = (tmp_tv.tv_sec * USEC_IN_SEC + tmp_tv.tv_usec) * 4 / interval; + ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n", + dccp_role(sk), sk, rtt); + if (rtt == 0) + rtt = 1; + + delta = now_delta(hcrx->ccid3hcrx_tstamp_last_feedback); + if (delta == 0) + delta = 1; + + x_recv = (hcrx->ccid3hcrx_bytes_recv * USEC_IN_SEC) / delta; + + tmp1 = (u64)x_recv * (u64)rtt; + do_div(tmp1,10000000); + tmp2 = (u32)tmp1; + fval = (hcrx->ccid3hcrx_s * 100000) / tmp2; + /* do not alter order above or you will get overflow on 32 bit */ + p = calcx_reverse_lookup(fval); + ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied loss rate=%u\n",\ + dccp_role(sk), sk, x_recv, p); + + if (p == 0) + return ~0; + else + return 1000000 / p; +} + +static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_loss_interval_hist_entry *li_entry; + + if (seq_loss != DCCP_MAX_SEQNO + 1) { + ccid3_pr_debug("%s, sk=%p, seq_loss=%llu, win_loss=%u, packet loss detected\n", + dccp_role(sk), sk, seq_loss, win_loss); + + if (list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { + struct ccid3_loss_interval_hist_entry *li_tail = NULL; + int i; + + ccid3_pr_debug("%s, sk=%p, first loss event detected, creating history\n", dccp_role(sk), sk); + for (i = 0; i <= TFRC_RECV_IVAL_F_LENGTH; ++i) { + li_entry = ccid3_loss_interval_hist_entry_new(SLAB_ATOMIC); + if (li_entry == NULL) { + ccid3_loss_interval_history_delete(&hcrx->ccid3hcrx_loss_interval_hist); + ccid3_pr_debug("%s, sk=%p, not enough mem for creating history\n", + dccp_role(sk), sk); + return; + } + if (li_tail == NULL) + li_tail = li_entry; + list_add(&li_entry->ccid3lih_node, &hcrx->ccid3hcrx_loss_interval_hist); + } + + li_entry->ccid3lih_seqno = seq_loss; + li_entry->ccid3lih_win_count = win_loss; + + li_tail->ccid3lih_interval = ccid3_hc_rx_calc_first_li(sk); + } + } + /* FIXME: find end of interval */ +} + +static void ccid3_hc_rx_detect_loss(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_rx_hist_entry *entry, *a_next, *b_next, *packet; + struct ccid3_rx_hist_entry *a_loss = NULL; + struct ccid3_rx_hist_entry *b_loss = NULL; + u64 seq_loss = DCCP_MAX_SEQNO + 1; + u8 win_loss = 0; + u8 num_later = TFRC_RECV_NUM_LATE_LOSS; + + list_for_each_entry_safe(entry, b_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (num_later == 0) { + b_loss = entry; + break; + } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) + --num_later; + } + + if (b_loss == NULL) + goto out_update_li; + + a_next = b_next; + num_later = 1; +#if 0 + FIXME MERGE GIT! + list_for_each_entry_safe_continue(entry, a_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + if (num_later == 0) { + a_loss = entry; + break; + } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || + entry->ccid3hrx_type == DCCP_PKT_DATAACK) + --num_later; + } +#endif + + if (a_loss == NULL) { + if (list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { + /* no loss event have occured yet */ + ccid3_pr_debug("%s, sk=%p, TODO: find a lost data " + "packet by comparing to initial seqno\n", + dccp_role(sk), sk); + goto out_update_li; + } else { + pr_info("%s: %s, sk=%p, ERROR! Less than 4 data packets in history", + __FUNCTION__, dccp_role(sk), sk); + return; + } + } + + /* Locate a lost data packet */ + entry = packet = b_loss; +#if 0 + FIXME MERGE GIT! + list_for_each_entry_safe_continue(entry, b_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + u64 delta = dccp_delta_seqno(entry->ccid3hrx_seqno, packet->ccid3hrx_seqno); + + if (delta != 0) { + if (packet->ccid3hrx_type == DCCP_PKT_DATA || + packet->ccid3hrx_type == DCCP_PKT_DATAACK) + --delta; + /* + * FIXME: check this, probably this % usage is because + * in earlier drafts the ndp count was just 8 bits + * long, but now it cam be up to 24 bits long. + */ +#if 0 + if (delta % DCCP_NDP_LIMIT != + (packet->ccid3hrx_ndp - entry->ccid3hrx_ndp) % DCCP_NDP_LIMIT) +#endif + if (delta != packet->ccid3hrx_ndp - entry->ccid3hrx_ndp) { + seq_loss = entry->ccid3hrx_seqno; + dccp_inc_seqno(&seq_loss); + } + } + packet = entry; + if (packet == a_loss) + break; + } +#endif + + if (seq_loss != DCCP_MAX_SEQNO + 1) + win_loss = a_loss->ccid3hrx_win_count; + +out_update_li: + ccid3_hc_rx_update_li(sk, seq_loss, win_loss); +} + +static u32 ccid3_hc_rx_calc_i_mean(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_loss_interval_hist_entry *li_entry, *li_next; + int i = 0; + u32 i_tot; + u32 i_tot0 = 0; + u32 i_tot1 = 0; + u32 w_tot = 0; + + list_for_each_entry_safe(li_entry, li_next, &hcrx->ccid3hcrx_loss_interval_hist, ccid3lih_node) { + if (i < TFRC_RECV_IVAL_F_LENGTH) { + i_tot0 += li_entry->ccid3lih_interval * ccid3_hc_rx_w[i]; + w_tot += ccid3_hc_rx_w[i]; + } + + if (i != 0) + i_tot1 += li_entry->ccid3lih_interval * ccid3_hc_rx_w[i - 1]; + + if (++i > TFRC_RECV_IVAL_F_LENGTH) + break; + } + + if (i != TFRC_RECV_IVAL_F_LENGTH) { + pr_info("%s: %s, sk=%p, ERROR! Missing entry in interval history!\n", + __FUNCTION__, dccp_role(sk), sk); + return 0; + } + + i_tot = max(i_tot0, i_tot1); + + /* FIXME: Why do we do this? -Ian McDonald */ + if (i_tot * 4 < w_tot) + i_tot = w_tot * 4; + + return i_tot * 4 / w_tot; +} + +static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct ccid3_rx_hist_entry *packet; + struct timeval now; + u8 win_count; + u32 p_prev; + int ins; +#if 0 + ccid3_pr_debug("%s, sk=%p(%s), skb=%p(%s)\n", + dccp_role(sk), sk, dccp_state_name(sk->sk_state), + skb, dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); +#endif + if (hcrx == NULL) + return; + + BUG_ON(!(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA || + hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA)); + + switch (DCCP_SKB_CB(skb)->dccpd_type) { + case DCCP_PKT_ACK: + if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) + return; + case DCCP_PKT_DATAACK: + if (dp->dccps_options_received.dccpor_timestamp_echo == 0) + break; + p_prev = hcrx->ccid3hcrx_rtt; + do_gettimeofday(&now); + /* hcrx->ccid3hcrx_rtt = now - dp->dccps_options_received.dccpor_timestamp_echo - + usecs_to_jiffies(dp->dccps_options_received.dccpor_elapsed_time * 10); + FIXME - I think above code is broken - have to look at options more, will also need + to fix pr_debug below */ + if (p_prev != hcrx->ccid3hcrx_rtt) + ccid3_pr_debug("%s, sk=%p, New RTT estimate=%lu jiffies, tstamp_echo=%u, elapsed time=%u\n", + dccp_role(sk), sk, hcrx->ccid3hcrx_rtt, + dp->dccps_options_received.dccpor_timestamp_echo, + dp->dccps_options_received.dccpor_elapsed_time); + break; + case DCCP_PKT_DATA: + break; + default: + ccid3_pr_debug("%s, sk=%p, not DATA/DATAACK/ACK packet(%s)\n", + dccp_role(sk), sk, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); + return; + } + + packet = ccid3_rx_hist_entry_new(sk, skb, SLAB_ATOMIC); + if (packet == NULL) { + ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet to history (consider it lost)!", + dccp_role(sk), sk); + return; + } + + win_count = packet->ccid3hrx_win_count; + + ins = ccid3_hc_rx_add_hist(sk, packet); + + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK) + return; + + switch (hcrx->ccid3hcrx_state) { + case TFRC_RSTATE_NO_DATA: + ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial feedback\n", + dccp_role(sk), sk, dccp_state_name(sk->sk_state), skb); + ccid3_hc_rx_send_feedback(sk); + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); + return; + case TFRC_RSTATE_DATA: + hcrx->ccid3hcrx_bytes_recv += skb->len - dccp_hdr(skb)->dccph_doff * 4; + if (ins == 0) { + do_gettimeofday(&now); + if ((now_delta(hcrx->ccid3hcrx_tstamp_last_ack)) >= hcrx->ccid3hcrx_rtt) { + hcrx->ccid3hcrx_tstamp_last_ack = now; + ccid3_hc_rx_send_feedback(sk); + } + return; + } + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state); + dump_stack(); + return; + } + + /* Dealing with packet loss */ + ccid3_pr_debug("%s, sk=%p(%s), skb=%p, data loss! Reacting...\n", + dccp_role(sk), sk, dccp_state_name(sk->sk_state), skb); + + ccid3_hc_rx_detect_loss(sk); + p_prev = hcrx->ccid3hcrx_p; + + /* Calculate loss event rate */ + if (!list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) + /* Scaling up by 1000000 as fixed decimal */ + hcrx->ccid3hcrx_p = 1000000 / ccid3_hc_rx_calc_i_mean(sk); + + if (hcrx->ccid3hcrx_p > p_prev) { + ccid3_hc_rx_send_feedback(sk); + return; + } +} + +static int ccid3_hc_rx_init(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + hcrx = dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), gfp_any()); + if (hcrx == NULL) + return -ENOMEM; + + memset(hcrx, 0, sizeof(*hcrx)); + + if (dp->dccps_avg_packet_size >= TFRC_MIN_PACKET_SIZE && + dp->dccps_avg_packet_size <= TFRC_MAX_PACKET_SIZE) + hcrx->ccid3hcrx_s = (u16)dp->dccps_avg_packet_size; + else + hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE; + + hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; + INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist); + INIT_LIST_HEAD(&hcrx->ccid3hcrx_loss_interval_hist); + + return 0; +} + +static void ccid3_hc_rx_exit(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + if (hcrx == NULL) + return; + + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); + + /* Empty packet history */ + ccid3_rx_history_delete(&hcrx->ccid3hcrx_hist); + + /* Empty loss interval history */ + ccid3_loss_interval_history_delete(&hcrx->ccid3hcrx_loss_interval_hist); + + kfree(dp->dccps_hc_rx_ccid_private); + dp->dccps_hc_rx_ccid_private = NULL; +} + +static struct ccid ccid3 = { + .ccid_id = 3, + .ccid_name = "ccid3", + .ccid_owner = THIS_MODULE, + .ccid_init = ccid3_init, + .ccid_exit = ccid3_exit, + .ccid_hc_tx_init = ccid3_hc_tx_init, + .ccid_hc_tx_exit = ccid3_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, + .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, + .ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options, + .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, + .ccid_hc_rx_init = ccid3_hc_rx_init, + .ccid_hc_rx_exit = ccid3_hc_rx_exit, + .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, + .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, +}; + +module_param(ccid3_debug, int, 0444); +MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); + +static __init int ccid3_module_init(void) +{ + int rc = -ENOMEM; + + ccid3_tx_hist_slab = kmem_cache_create("dccp_ccid3_tx_history", + sizeof(struct ccid3_tx_hist_entry), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (ccid3_tx_hist_slab == NULL) + goto out; + + ccid3_rx_hist_slab = kmem_cache_create("dccp_ccid3_rx_history", + sizeof(struct ccid3_rx_hist_entry), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (ccid3_rx_hist_slab == NULL) + goto out_free_tx_history; + + ccid3_loss_interval_hist_slab = kmem_cache_create("dccp_ccid3_loss_interval_history", + sizeof(struct ccid3_loss_interval_hist_entry), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (ccid3_loss_interval_hist_slab == NULL) + goto out_free_rx_history; + + rc = ccid_register(&ccid3); + if (rc != 0) + goto out_free_loss_interval_history; + +out: + return rc; +out_free_loss_interval_history: + kmem_cache_destroy(ccid3_loss_interval_hist_slab); + ccid3_loss_interval_hist_slab = NULL; +out_free_rx_history: + kmem_cache_destroy(ccid3_rx_hist_slab); + ccid3_rx_hist_slab = NULL; +out_free_tx_history: + kmem_cache_destroy(ccid3_tx_hist_slab); + ccid3_tx_hist_slab = NULL; + goto out; +} +module_init(ccid3_module_init); + +static __exit void ccid3_module_exit(void) +{ + ccid_unregister(&ccid3); + + if (ccid3_tx_hist_slab != NULL) { + kmem_cache_destroy(ccid3_tx_hist_slab); + ccid3_tx_hist_slab = NULL; + } + if (ccid3_rx_hist_slab != NULL) { + kmem_cache_destroy(ccid3_rx_hist_slab); + ccid3_rx_hist_slab = NULL; + } + if (ccid3_loss_interval_hist_slab != NULL) { + kmem_cache_destroy(ccid3_loss_interval_hist_slab); + ccid3_loss_interval_hist_slab = NULL; + } +} +module_exit(ccid3_module_exit); + +MODULE_AUTHOR("Ian McDonald & Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("net-dccp-ccid-3"); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h new file mode 100644 index 0000000..5d6b623 --- /dev/null +++ b/net/dccp/ccids/ccid3.h @@ -0,0 +1,137 @@ +/* + * net/dccp/ccids/ccid3.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#ifndef _DCCP_CCID3_H_ +#define _DCCP_CCID3_H_ + +#include +#include +#include + +struct ccid3_tx_hist_entry { + struct list_head ccid3htx_node; + u64 ccid3htx_seqno:48, + ccid3htx_win_count:8, + ccid3htx_sent:1; + struct timeval ccid3htx_tstamp; +}; + +struct ccid3_options_received { + u64 ccid3or_seqno:48, + ccid3or_loss_intervals_idx:16; + u16 ccid3or_loss_intervals_len; + u32 ccid3or_loss_event_rate; + u32 ccid3or_receive_rate; +}; + +/** struct ccid3_hc_tx_sock - CCID3 sender half connection congestion control block + * + * @ccid3hctx_state - Sender state + * @ccid3hctx_x - Current sending rate + * @ccid3hctx_x_recv - Receive rate + * @ccid3hctx_x_calc - Calculated send (?) rate + * @ccid3hctx_s - Packet size + * @ccid3hctx_rtt - Estimate of current round trip time in usecs + * @@ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 + * @ccid3hctx_last_win_count - Last window counter sent + * @ccid3hctx_t_last_win_count - Timestamp of earliest packet with last_win_count value sent + * @ccid3hctx_no_feedback_timer - Handle to no feedback timer + * @ccid3hctx_idle - FIXME + * @ccid3hctx_t_ld - Time last doubled during slow start + * @ccid3hctx_t_nom - Nominal send time of next packet + * @ccid3hctx_t_ipi - Interpacket (send) interval + * @ccid3hctx_delta - Send timer delta + * @ccid3hctx_hist - Packet history + */ +struct ccid3_hc_tx_sock { + u32 ccid3hctx_x; + u32 ccid3hctx_x_recv; + u32 ccid3hctx_x_calc; + u16 ccid3hctx_s; + u32 ccid3hctx_rtt; + u32 ccid3hctx_p; + u8 ccid3hctx_state; + u8 ccid3hctx_last_win_count; + u8 ccid3hctx_idle; + struct timeval ccid3hctx_t_last_win_count; + struct timer_list ccid3hctx_no_feedback_timer; + struct timeval ccid3hctx_t_ld; + struct timeval ccid3hctx_t_nom; + u32 ccid3hctx_t_ipi; + u32 ccid3hctx_delta; + struct list_head ccid3hctx_hist; + struct ccid3_options_received ccid3hctx_options_received; +}; + +struct ccid3_loss_interval_hist_entry { + struct list_head ccid3lih_node; + u64 ccid3lih_seqno:48, + ccid3lih_win_count:4; + u32 ccid3lih_interval; +}; + +struct ccid3_rx_hist_entry { + struct list_head ccid3hrx_node; + u64 ccid3hrx_seqno:48, + ccid3hrx_win_count:4, + ccid3hrx_type:4; + u32 ccid3hrx_ndp; /* In fact it is from 8 to 24 bits */ + struct timeval ccid3hrx_tstamp; +}; + +struct ccid3_hc_rx_sock { + u64 ccid3hcrx_seqno_last_counter:48, + ccid3hcrx_state:8, + ccid3hcrx_last_counter:4; + unsigned long ccid3hcrx_rtt; + u32 ccid3hcrx_p; + u32 ccid3hcrx_bytes_recv; + struct timeval ccid3hcrx_tstamp_last_feedback; + struct timeval ccid3hcrx_tstamp_last_ack; + struct list_head ccid3hcrx_hist; + struct list_head ccid3hcrx_loss_interval_hist; + u16 ccid3hcrx_s; + u32 ccid3hcrx_pinv; + u32 ccid3hcrx_elapsed_time; + u32 ccid3hcrx_x_recv; +}; + +#define ccid3_hc_tx_field(s,field) (s->dccps_hc_tx_ccid_private == NULL ? 0 : \ + ((struct ccid3_hc_tx_sock *)s->dccps_hc_tx_ccid_private)->ccid3hctx_##field) + +#define ccid3_hc_rx_field(s,field) (s->dccps_hc_rx_ccid_private == NULL ? 0 : \ + ((struct ccid3_hc_rx_sock *)s->dccps_hc_rx_ccid_private)->ccid3hcrx_##field) + +#endif /* _DCCP_CCID3_H_ */ diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h new file mode 100644 index 0000000..fb83454 --- /dev/null +++ b/net/dccp/dccp.h @@ -0,0 +1,422 @@ +#ifndef _DCCP_H +#define _DCCP_H +/* + * net/dccp/dccp.h + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#define DCCP_DEBUG + +#ifdef DCCP_DEBUG +extern int dccp_debug; + +#define dccp_pr_debug(format, a...) \ + do { if (dccp_debug) \ + printk(KERN_DEBUG "%s: " format, __FUNCTION__ , ##a); \ + } while (0) +#define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) printk(format, ##a); } while (0) +#else +#define dccp_pr_debug(format, a...) +#define dccp_pr_debug_cat(format, a...) +#endif + +extern struct inet_hashinfo dccp_hashinfo; + +extern atomic_t dccp_orphan_count; +extern int dccp_tw_count; +extern void dccp_tw_deschedule(struct inet_timewait_sock *tw); + +extern void dccp_time_wait(struct sock *sk, int state, int timeo); + +/* FIXME: Right size this */ +#define DCCP_MAX_OPT_LEN 128 + +#define DCCP_MAX_PACKET_HDR 32 + +#define MAX_DCCP_HEADER (DCCP_MAX_PACKET_HDR + DCCP_MAX_OPT_LEN + MAX_HEADER) + +#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT + * state, about 60 seconds */ + +/* draft-ietf-dccp-spec-11.txt initial RTO value */ +#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ)) + +/* Maximal interval between probes for local resources. */ +#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U)) + +#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ + +extern struct proto dccp_v4_prot; + +/* is seq1 < seq2 ? */ +static inline const int before48(const u64 seq1, const u64 seq2) +{ + return (const s64)((seq1 << 16) - (seq2 << 16)) < 0; +} + +/* is seq1 > seq2 ? */ +static inline const int after48(const u64 seq1, const u64 seq2) +{ + return (const s64)((seq2 << 16) - (seq1 << 16)) < 0; +} + +/* is seq2 <= seq1 <= seq3 ? */ +static inline const int between48(const u64 seq1, const u64 seq2, const u64 seq3) +{ + return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); +} + +static inline u64 max48(const u64 seq1, const u64 seq2) +{ + return after48(seq1, seq2) ? seq1 : seq2; +} + +enum { + DCCP_MIB_NUM = 0, + DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ + DCCP_MIB_ESTABRESETS, /* EstabResets */ + DCCP_MIB_CURRESTAB, /* CurrEstab */ + DCCP_MIB_OUTSEGS, /* OutSegs */ + DCCP_MIB_OUTRSTS, + DCCP_MIB_ABORTONTIMEOUT, + DCCP_MIB_TIMEOUTS, + DCCP_MIB_ABORTFAILED, + DCCP_MIB_PASSIVEOPENS, + DCCP_MIB_ATTEMPTFAILS, + DCCP_MIB_OUTDATAGRAMS, + DCCP_MIB_INERRS, + DCCP_MIB_OPTMANDATORYERROR, + DCCP_MIB_INVALIDOPT, + __DCCP_MIB_MAX +}; + +#define DCCP_MIB_MAX __DCCP_MIB_MAX +struct dccp_mib { + unsigned long mibs[DCCP_MIB_MAX]; +} __SNMP_MIB_ALIGN__; + +DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); +#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) +#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) +#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field) +#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) +#define DCCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(dccp_statistics, field, val) +#define DCCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(dccp_statistics, field, val) + +extern int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb); +extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb); + +extern int dccp_send_response(struct sock *sk); +extern void dccp_send_ack(struct sock *sk); +extern void dccp_send_delayed_ack(struct sock *sk); +extern void dccp_send_sync(struct sock *sk, u64 seq); + +extern void dccp_init_xmit_timers(struct sock *sk); +static inline void dccp_clear_xmit_timers(struct sock *sk) +{ + inet_csk_clear_xmit_timers(sk); +} + +extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); + +extern const char *dccp_packet_name(const int type); +extern const char *dccp_state_name(const int state); + +static inline void dccp_set_state(struct sock *sk, const int state) +{ + const int oldstate = sk->sk_state; + + dccp_pr_debug("%s(%p) %-10.10s -> %s\n", + dccp_role(sk), sk, + dccp_state_name(oldstate), dccp_state_name(state)); + WARN_ON(state == oldstate); + + switch (state) { + case DCCP_OPEN: + if (oldstate != DCCP_OPEN) + DCCP_INC_STATS(DCCP_MIB_CURRESTAB); + break; + + case DCCP_CLOSED: + if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN) + DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); + + sk->sk_prot->unhash(sk); + if (inet_csk(sk)->icsk_bind_hash != NULL && + !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + inet_put_port(&dccp_hashinfo, sk); + /* fall through */ + default: + if (oldstate == DCCP_OPEN) + DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); + } + + /* Change state AFTER socket is unhashed to avoid closed + * socket sitting in hash tables. + */ + sk->sk_state = state; +} + +static inline void dccp_done(struct sock *sk) +{ + dccp_set_state(sk, DCCP_CLOSED); + dccp_clear_xmit_timers(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + else + inet_csk_destroy_sock(sk); +} + +static inline void dccp_openreq_init(struct request_sock *req, + struct dccp_sock *dp, + struct sk_buff *skb) +{ + /* + * FIXME: fill in the other req fields from the DCCP options + * received + */ + inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; + inet_rsk(req)->acked = 0; + req->rcv_wnd = 0; +} + +extern void dccp_v4_send_check(struct sock *sk, struct dccp_hdr *dh, int len, + struct sk_buff *skb); +extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); + +extern struct sock *dccp_create_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb); + +extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); + +extern void dccp_v4_err(struct sk_buff *skb, u32); + +extern int dccp_v4_rcv(struct sk_buff *skb); + +extern struct sock *dccp_v4_request_recv_sock(struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); +extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct request_sock **prev); + +extern int dccp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb); +extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct dccp_hdr *dh, unsigned len); +extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len); + +extern void dccp_close(struct sock *sk, long timeout); +extern struct sk_buff *dccp_make_response(struct sock *sk, + struct dst_entry *dst, + struct request_sock *req); + +extern int dccp_connect(struct sock *sk); +extern int dccp_disconnect(struct sock *sk, int flags); +extern int dccp_getsockopt(struct sock *sk, int level, int optname, + char *optval, int *optlen); +extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); +extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t size); +extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len); +extern int dccp_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen); +extern void dccp_shutdown(struct sock *sk, int how); + +extern int dccp_v4_checksum(struct sk_buff *skb); + +extern int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code); +extern void dccp_send_close(struct sock *sk); + +struct dccp_skb_cb { + __u8 dccpd_type; + __u8 dccpd_reset_code; + __u8 dccpd_service; + __u8 dccpd_ccval; + __u64 dccpd_seq; + __u64 dccpd_ack_seq; + int dccpd_opt_len; +}; + +#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) + +static inline int dccp_non_data_packet(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; + + return type == DCCP_PKT_ACK || + type == DCCP_PKT_CLOSE || + type == DCCP_PKT_CLOSEREQ || + type == DCCP_PKT_RESET || + type == DCCP_PKT_SYNC || + type == DCCP_PKT_SYNCACK; +} + +static inline int dccp_packet_without_ack(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; + + return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; +} + +#define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1) +#define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2) + +static inline void dccp_set_seqno(u64 *seqno, u64 value) +{ + if (value > DCCP_MAX_SEQNO) + value -= DCCP_MAX_SEQNO + 1; + *seqno = value; +} + +static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2) +{ + return ((seqno2 << 16) - (seqno1 << 16)) >> 16; +} + +static inline void dccp_inc_seqno(u64 *seqno) +{ + if (++*seqno > DCCP_MAX_SEQNO) + *seqno = 0; +} + +static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) +{ + struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + sizeof(*dh)); + +#if defined(__LITTLE_ENDIAN_BITFIELD) + dh->dccph_seq = htonl((gss >> 32)) >> 8; +#elif defined(__BIG_ENDIAN_BITFIELD) + dh->dccph_seq = htonl((gss >> 32)); +#else +#error "Adjust your defines" +#endif + dhx->dccph_seq_low = htonl(gss & 0xffffffff); +} + +static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, const u64 gsr) +{ +#if defined(__LITTLE_ENDIAN_BITFIELD) + dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8; +#elif defined(__BIG_ENDIAN_BITFIELD) + dhack->dccph_ack_nr_high = htonl((gsr >> 32)); +#else +#error "Adjust your defines" +#endif + dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); +} + +static inline void dccp_update_gsr(struct sock *sk, u64 seq) +{ + struct dccp_sock *dp = dccp_sk(sk); + u64 tmp_gsr; + + dccp_set_seqno(&tmp_gsr, dp->dccps_gsr + 1 - (dp->dccps_options.dccpo_sequence_window / 4)); + dp->dccps_gsr = seq; + dccp_set_seqno(&dp->dccps_swl, max48(tmp_gsr, dp->dccps_isr)); + dccp_set_seqno(&dp->dccps_swh, + dp->dccps_gsr + (3 * dp->dccps_options.dccpo_sequence_window) / 4); +} + +static inline void dccp_update_gss(struct sock *sk, u64 seq) +{ + struct dccp_sock *dp = dccp_sk(sk); + u64 tmp_gss; + + dccp_set_seqno(&tmp_gss, dp->dccps_gss - dp->dccps_options.dccpo_sequence_window + 1); + dp->dccps_awl = max48(tmp_gss, dp->dccps_iss); + dp->dccps_awh = dp->dccps_gss = seq; +} + +extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb); +extern void dccp_insert_option_elapsed_time(struct sock *sk, + struct sk_buff *skb, + u32 elapsed_time); +extern void dccp_insert_option(struct sock *sk, struct sk_buff *skb, + unsigned char option, + const void *value, unsigned char len); + +extern struct socket *dccp_ctl_socket; + +#define DCCP_ACKPKTS_STATE_RECEIVED 0 +#define DCCP_ACKPKTS_STATE_ECN_MARKED (1 << 6) +#define DCCP_ACKPKTS_STATE_NOT_RECEIVED (3 << 6) + +#define DCCP_ACKPKTS_STATE_MASK 0xC0 /* 11000000 */ +#define DCCP_ACKPKTS_LEN_MASK 0x3F /* 00111111 */ + +/** struct dccp_ackpkts - acknowledgeable packets + * + * This data structure is the one defined in the DCCP draft + * Appendix A. + * + * @dccpap_buf_head - circular buffer head + * @dccpap_buf_tail - circular buffer tail + * @dccpap_buf_ackno - ack # of the most recent packet acknoldgeable in the buffer (i.e. %dccpap_buf_head) + * @dccpap_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked by the buffer with State 0 + * + * Additionally, the HC-Receiver must keep some information about the + * Ack Vectors it has recently sent. For each packet sent carrying an + * Ack Vector, it remembers four variables: + * + * @dccpap_ack_seqno - the Sequence Number used for the packet (HC-Receiver seqno) + * @dccpap_ack_ptr - the value of buf_head at the time of acknowledgement. + * @dccpap_ack_ackno - the Acknowledgement Number used for the packet (HC-Sender seqno) + * @dccpap_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. + * + * @dccpap_buf_len - circular buffer length + * @dccpap_buf - circular buffer of acknowledgeable packets + */ +struct dccp_ackpkts { + unsigned int dccpap_buf_head; + unsigned int dccpap_buf_tail; + u64 dccpap_buf_ackno; + u64 dccpap_ack_seqno; + u64 dccpap_ack_ackno; + unsigned int dccpap_ack_ptr; + unsigned int dccpap_buf_vector_len; + unsigned int dccpap_ack_vector_len; + unsigned int dccpap_buf_len; + unsigned long dccpap_time; + u8 dccpap_buf_nonce; + u8 dccpap_ack_nonce; + u8 dccpap_buf[0]; +}; + +extern struct dccp_ackpkts *dccp_ackpkts_alloc(unsigned int len, int priority); +extern void dccp_ackpkts_free(struct dccp_ackpkts *ap); +extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state); +extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, + struct sock *sk, u64 ackno); + +#ifdef DCCP_DEBUG +extern void dccp_ackvector_print(const u64 ackno, + const unsigned char *vector, int len); +extern void dccp_ackpkts_print(const struct dccp_ackpkts *ap); +#else +static inline void dccp_ackvector_print(const u64 ackno, + const unsigned char *vector, + int len) { } +static inline void dccp_ackpkts_print(const struct dccp_ackpkts *ap) { } +#endif + +#endif /* _DCCP_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c new file mode 100644 index 0000000..622e976 --- /dev/null +++ b/net/dccp/input.c @@ -0,0 +1,510 @@ +/* + * net/dccp/input.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include + +#include "ccid.h" +#include "dccp.h" + +static void dccp_fin(struct sock *sk, struct sk_buff *skb) +{ + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk, 0); +} + +static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb) +{ + switch (sk->sk_state) { + case DCCP_PARTOPEN: + case DCCP_OPEN: + dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED); + dccp_fin(sk, skb); + dccp_set_state(sk, DCCP_CLOSED); + break; + } +} + +static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) +{ + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == CloseReq) + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + return; + } + + switch (sk->sk_state) { + case DCCP_PARTOPEN: + case DCCP_OPEN: + dccp_set_state(sk, DCCP_CLOSING); + dccp_send_close(sk); + break; + } +} + +static inline void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (dp->dccps_options.dccpo_send_ack_vector) + dccp_ackpkts_check_rcv_ackno(dp->dccps_hc_rx_ackpkts, sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq); +} + +static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + struct dccp_sock *dp = dccp_sk(sk); + u64 lswl = dp->dccps_swl; + u64 lawl = dp->dccps_awl; + + /* + * Step 5: Prepare sequence numbers for Sync + * If P.type == Sync or P.type == SyncAck, + * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, + * / * P is valid, so update sequence number variables + * accordingly. After this update, P will pass the tests + * in Step 6. A SyncAck is generated if necessary in + * Step 15 * / + * Update S.GSR, S.SWL, S.SWH + * Otherwise, + * Drop packet and return + */ + if (dh->dccph_type == DCCP_PKT_SYNC || + dh->dccph_type == DCCP_PKT_SYNCACK) { + if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awl, dp->dccps_awh) && + !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl)) + dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); + else + return -1; + /* + * Step 6: Check sequence numbers + * Let LSWL = S.SWL and LAWL = S.AWL + * If P.type == CloseReq or P.type == Close or P.type == Reset, + * LSWL := S.GSR + 1, LAWL := S.GAR + * If LSWL <= P.seqno <= S.SWH + * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), + * Update S.GSR, S.SWL, S.SWH + * If P.type != Sync, + * Update S.GAR + * Otherwise, + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ || + dh->dccph_type == DCCP_PKT_CLOSE || + dh->dccph_type == DCCP_PKT_RESET) { + lswl = dp->dccps_gsr; + dccp_inc_seqno(&lswl); + lawl = dp->dccps_gar; + } + + if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) && + (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ || + between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, lawl, dp->dccps_awh))) { + dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); + + if (dh->dccph_type != DCCP_PKT_SYNC && + DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq; + } else { + dccp_pr_debug("Step 6 failed, sending SYNC...\n"); + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + return -1; + } + + return 0; +} + +int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (dccp_check_seqno(sk, skb)) + goto discard; + + if (dccp_parse_options(sk, skb)) + goto discard; + + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + /* + * FIXME: check ECN to see if we should use + * DCCP_ACKPKTS_STATE_ECN_MARKED + */ + if (dp->dccps_options.dccpo_send_ack_vector) { + struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; + + if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKPKTS_STATE_RECEIVED)) { + LIMIT_NETDEBUG(pr_info("DCCP: acknowledgeable packets buffer full!\n")); + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MIN, TCP_RTO_MAX); + goto discard; + } + + /* + * FIXME: this activation is probably wrong, have to study more + * TCP delack machinery and how it fits into DCCP draft, but + * for now it kinda "works" 8) + */ + if (!inet_csk_ack_scheduled(sk)) { + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5 * HZ, TCP_RTO_MAX); + } + } + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + + switch (dccp_hdr(skb)->dccph_type) { + case DCCP_PKT_DATAACK: + case DCCP_PKT_DATA: + /* + * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED option + * if it is. + */ + __skb_pull(skb, dh->dccph_doff * 4); + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk, 0); + return 0; + case DCCP_PKT_ACK: + goto discard; + case DCCP_PKT_RESET: + /* + * Step 9: Process Reset + * If P.type == Reset, + * Tear down connection + * S.state := TIMEWAIT + * Set TIMEWAIT timer + * Drop packet and return + */ + dccp_fin(sk, skb); + dccp_time_wait(sk, DCCP_TIME_WAIT, 0); + return 0; + case DCCP_PKT_CLOSEREQ: + dccp_rcv_closereq(sk, skb); + goto discard; + case DCCP_PKT_CLOSE: + dccp_rcv_close(sk, skb); + return 0; + case DCCP_PKT_REQUEST: + /* Step 7 + * or (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state >= OPEN and P.type == Request + * and P.seqno >= S.OSR) + * or (S.state >= OPEN and P.type == Response + * and P.seqno >= S.OSR) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + if (dp->dccps_role != DCCP_ROLE_LISTEN) + goto send_sync; + goto check_seq; + case DCCP_PKT_RESPONSE: + if (dp->dccps_role != DCCP_ROLE_CLIENT) + goto send_sync; +check_seq: + if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) { +send_sync: + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + } + break; + } + + DCCP_INC_STATS_BH(DCCP_MIB_INERRS); +discard: + __kfree_skb(skb); + return 0; +} + +static int dccp_rcv_request_sent_state_process(struct sock *sk, + struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) +{ + /* + * Step 4: Prepare sequence numbers in REQUEST + * If S.state == REQUEST, + * If (P.type == Response or P.type == Reset) + * and S.AWL <= P.ackno <= S.AWH, + * / * Set sequence number variables corresponding to the + * other endpoint, so P will pass the tests in Step 6 * / + * Set S.GSR, S.ISR, S.SWL, S.SWH + * / * Response processing continues in Step 10; Reset + * processing continues in Step 9 * / + */ + if (dh->dccph_type == DCCP_PKT_RESPONSE) { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct dccp_sock *dp = dccp_sk(sk); + + /* Stop the REQUEST timer */ + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); + BUG_TRAP(sk->sk_send_head != NULL); + __kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + + if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awl, dp->dccps_awh)) { + dccp_pr_debug("invalid ackno: S.AWL=%llu, P.ackno=%llu, S.AWH=%llu \n", + dp->dccps_awl, DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awh); + goto out_invalid_packet; + } + + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); + + if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 || + ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) { + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); + /* FIXME: send appropriate RESET code */ + goto out_invalid_packet; + } + + dccp_sync_mss(sk, dp->dccps_pmtu_cookie); + + /* + * Step 10: Process REQUEST state (second part) + * If S.state == REQUEST, + * / * If we get here, P is a valid Response from the server (see + * Step 4), and we should move to PARTOPEN state. PARTOPEN + * means send an Ack, don't send Data packets, retransmit + * Acks periodically, and always include any Init Cookie from + * the Response * / + * S.state := PARTOPEN + * Set PARTOPEN timer + * Continue with S.state == PARTOPEN + * / * Step 12 will send the Ack completing the three-way + * handshake * / + */ + dccp_set_state(sk, DCCP_PARTOPEN); + + /* Make sure socket is routed, for correct metrics. */ + inet_sk_rebuild_header(sk); + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + } + + if (sk->sk_write_pending || icsk->icsk_ack.pingpong || + icsk->icsk_accept_queue.rskq_defer_accept) { + /* Save one ACK. Data will be ready after + * several ticks, if write_pending is set. + * + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK + */ + /* + * OK, in DCCP we can as well do a similar trick, its + * even in the draft, but there is no need for us to + * schedule an ack here, as dccp_sendmsg does this for + * us, also stated in the draft. -acme + */ + __kfree_skb(skb); + return 0; + } + dccp_send_ack(sk); + return -1; + } + +out_invalid_packet: + return 1; /* dccp_v4_do_rcv will send a reset, but... + FIXME: the reset code should be DCCP_RESET_CODE_PACKET_ERROR */ +} + +static int dccp_rcv_respond_partopen_state_process(struct sock *sk, + struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) +{ + int queued = 0; + + switch (dh->dccph_type) { + case DCCP_PKT_RESET: + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + break; + case DCCP_PKT_DATAACK: + case DCCP_PKT_ACK: + /* + * FIXME: we should be reseting the PARTOPEN (DELACK) timer here, + * but only if we haven't used the DELACK timer for something else, + * like sending a delayed ack for a TIMESTAMP echo, etc, for now + * were not clearing it, sending an extra ACK when there is nothing + * else to do in DELACK is not a big deal after all. + */ + + /* Stop the PARTOPEN timer */ + if (sk->sk_state == DCCP_PARTOPEN) + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + + dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_set_state(sk, DCCP_OPEN); + + if (dh->dccph_type == DCCP_PKT_DATAACK) { + dccp_rcv_established(sk, skb, dh, len); + queued = 1; /* packet was queued (by dccp_rcv_established) */ + } + break; + } + + return queued; +} + +int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct dccp_hdr *dh, unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + const int old_state = sk->sk_state; + int queued = 0; + + if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING) { + if (dccp_check_seqno(sk, skb)) + goto discard; + + /* + * Step 8: Process options and mark acknowledgeable + */ + if (dccp_parse_options(sk, skb)) + goto discard; + + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + + /* + * FIXME: check ECN to see if we should use + * DCCP_ACKPKTS_STATE_ECN_MARKED + */ + if (dp->dccps_options.dccpo_send_ack_vector) { + if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKPKTS_STATE_RECEIVED)) + goto discard; + /* + * FIXME: this activation is probably wrong, have to study more + * TCP delack machinery and how it fits into DCCP draft, but + * for now it kinda "works" 8) + */ + if (dp->dccps_hc_rx_ackpkts->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1 && + !inet_csk_ack_scheduled(sk)) { + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MIN, TCP_RTO_MAX); + } + } + } + + /* + * Step 9: Process Reset + * If P.type == Reset, + * Tear down connection + * S.state := TIMEWAIT + * Set TIMEWAIT timer + * Drop packet and return + */ + if (dh->dccph_type == DCCP_PKT_RESET) { + /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ + dccp_fin(sk, skb); + dccp_time_wait(sk, DCCP_TIME_WAIT, 0); + return 0; + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == CloseReq) + * or (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && + (dh->dccph_type == DCCP_PKT_RESPONSE || dh->dccph_type == DCCP_PKT_CLOSEREQ)) || + (dp->dccps_role == DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_REQUEST) || + (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + goto discard; + } + + switch (sk->sk_state) { + case DCCP_CLOSED: + return 1; + + case DCCP_LISTEN: + if (dh->dccph_type == DCCP_PKT_ACK || + dh->dccph_type == DCCP_PKT_DATAACK) + return 1; + + if (dh->dccph_type == DCCP_PKT_RESET) + goto discard; + + if (dh->dccph_type == DCCP_PKT_REQUEST) { + if (dccp_v4_conn_request(sk, skb) < 0) + return 1; + + /* FIXME: do congestion control initialization */ + goto discard; + } + goto discard; + + case DCCP_REQUESTING: + /* FIXME: do congestion control initialization */ + + queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); + if (queued >= 0) + return queued; + + __kfree_skb(skb); + return 0; + + case DCCP_RESPOND: + case DCCP_PARTOPEN: + queued = dccp_rcv_respond_partopen_state_process(sk, skb, dh, len); + break; + } + + if (dh->dccph_type == DCCP_PKT_ACK || dh->dccph_type == DCCP_PKT_DATAACK) { + switch (old_state) { + case DCCP_PARTOPEN: + sk->sk_state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + break; + } + } + + if (!queued) { +discard: + __kfree_skb(skb); + } + return 0; +} diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c new file mode 100644 index 0000000..083baca --- /dev/null +++ b/net/dccp/ipv4.c @@ -0,0 +1,1289 @@ +/* + * net/dccp/ipv4.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ccid.h" +#include "dccp.h" + +struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { + .lhash_lock = RW_LOCK_UNLOCKED, + .lhash_users = ATOMIC_INIT(0), + .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), + .portalloc_lock = SPIN_LOCK_UNLOCKED, + .port_rover = 1024 - 1, +}; + +static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) +{ + return inet_csk_get_port(&dccp_hashinfo, sk, snum); +} + +static void dccp_v4_hash(struct sock *sk) +{ + inet_hash(&dccp_hashinfo, sk); +} + +static void dccp_v4_unhash(struct sock *sk) +{ + inet_unhash(&dccp_hashinfo, sk); +} + +/* called with local bh disabled */ +static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_sock *inet = inet_sk(sk); + const u32 daddr = inet->rcv_saddr; + const u32 saddr = inet->daddr; + const int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, dccp_hashinfo.ehash_size); + struct inet_ehash_bucket *head = &dccp_hashinfo.ehash[hash]; + const struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { + tw = inet_twsk(sk2); + + if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->num = lport; + inet->sport = htons(lport); + sk->sk_hashent = hash; + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp != NULL) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw != NULL) { + /* Silly. Should hash-dance instead... */ + dccp_tw_deschedule(tw); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +/* + * Bind a port for a connect operation and hash it. + */ +static int dccp_v4_hash_connect(struct sock *sk) +{ + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (snum == 0) { + int rover; + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + + /* TODO. Actually it is not so bad idea to remove + * dccp_hashinfo.portalloc_lock before next submission to Linus. + * As soon as we touch this place at all it is time to think. + * + * Now it protects single _advisory_ variable dccp_hashinfo.port_rover, + * hence it is mostly useless. + * Code will work nicely if we just delete it, but + * I am afraid in contented case it will work not better or + * even worse: another cpu just will hit the same bucket + * and spin there. + * So some cpu salt could remove both contention and + * memory pingpong. Any ideas how to do this in a nice way? + */ + spin_lock(&dccp_hashinfo.portalloc_lock); + rover = dccp_hashinfo.port_rover; + + do { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + head = &dccp_hashinfo.bhash[inet_bhashfn(rover, dccp_hashinfo.bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == rover) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__dccp_v4_check_established(sk, + rover, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, head, rover); + if (tb == NULL) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } while (--remaining > 0); + dccp_hashinfo.port_rover = rover; + spin_unlock(&dccp_hashinfo.portalloc_lock); + + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + /* All locks still held and bhs disabled */ + dccp_hashinfo.port_rover = rover; + spin_unlock(&dccp_hashinfo.portalloc_lock); + + inet_bind_hash(sk, tb, rover); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(rover); + __inet_hash(&dccp_hashinfo, sk, 0); + } + spin_unlock(&head->lock); + + if (tw != NULL) { + dccp_tw_deschedule(tw); + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &dccp_hashinfo.bhash[inet_bhashfn(snum, dccp_hashinfo.bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { + __inet_hash(&dccp_hashinfo, sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __dccp_v4_check_established(sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + struct rtable *rt; + u32 daddr, nexthop; + int tmp; + int err; + + dp->dccps_role = DCCP_ROLE_CLIENT; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + nexthop = daddr = usin->sin_addr.s_addr; + if (inet->opt != NULL && inet->opt->srr) { + if (daddr == 0) + return -EINVAL; + nexthop = inet->opt->faddr; + } + + tmp = ip_route_connect(&rt, nexthop, inet->saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_DCCP, + inet->sport, usin->sin_port, sk); + if (tmp < 0) + return tmp; + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + if (inet->opt == NULL || !inet->opt->srr) + daddr = rt->rt_dst; + + if (inet->saddr == 0) + inet->saddr = rt->rt_src; + inet->rcv_saddr = inet->saddr; + + inet->dport = usin->sin_port; + inet->daddr = daddr; + + dp->dccps_ext_header_len = 0; + if (inet->opt != NULL) + dp->dccps_ext_header_len = inet->opt->optlen; + /* + * Socket identity is still unknown (sport may be zero). + * However we set state to DCCP_REQUESTING and not releasing socket + * lock select source port, enter ourselves into the hash tables and + * complete initialization after this. + */ + dccp_set_state(sk, DCCP_REQUESTING); + err = dccp_v4_hash_connect(sk); + if (err != 0) + goto failure; + + err = ip_route_newports(&rt, inet->sport, inet->dport, sk); + if (err != 0) + goto failure; + + /* OK, now commit destination to socket. */ + sk_setup_caps(sk, &rt->u.dst); + + dp->dccps_gar = + dp->dccps_iss = secure_dccp_sequence_number(inet->saddr, + inet->daddr, + inet->sport, + usin->sin_port); + dccp_update_gss(sk, dp->dccps_iss); + + inet->id = dp->dccps_iss ^ jiffies; + + err = dccp_connect(sk); + rt = NULL; + if (err != 0) + goto failure; +out: + return err; +failure: + /* This unhashes the socket and releases the local port, if necessary. */ + dccp_set_state(sk, DCCP_CLOSED); + ip_rt_put(rt); + sk->sk_route_caps = 0; + inet->dport = 0; + goto out; +} + +/* + * This routine does path mtu discovery as defined in RFC1191. + */ +static inline void dccp_do_pmtu_discovery(struct sock *sk, + const struct iphdr *iph, + u32 mtu) +{ + struct dst_entry *dst; + const struct inet_sock *inet = inet_sk(sk); + const struct dccp_sock *dp = dccp_sk(sk); + + /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs + * send out by Linux are always < 576bytes so they should go through + * unfragmented). + */ + if (sk->sk_state == DCCP_LISTEN) + return; + + /* We don't check in the destentry if pmtu discovery is forbidden + * on this route. We just assume that no packet_to_big packets + * are send back when pmtu discovery is not active. + * There is a small race when the user changes this flag in the + * route, but I think that's acceptable. + */ + if ((dst = __sk_dst_check(sk, 0)) == NULL) + return; + + dst->ops->update_pmtu(dst, mtu); + + /* Something is about to be wrong... Remember soft error + * for the case, if this connection will not able to recover. + */ + if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) + sk->sk_err_soft = EMSGSIZE; + + mtu = dst_mtu(dst); + + if (inet->pmtudisc != IP_PMTUDISC_DONT && + dp->dccps_pmtu_cookie > mtu) { + dccp_sync_mss(sk, mtu); + + /* + * From: draft-ietf-dccp-spec-11.txt + * + * DCCP-Sync packets are the best choice for upward probing, + * since DCCP-Sync probes do not risk application data loss. + */ + dccp_send_sync(sk, dp->dccps_gsr); + } /* else let the usual retransmit timer handle it */ +} + +static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb) +{ + int err; + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_ack_bits); + struct sk_buff *skb; + + if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL) + return; + + skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC); + if (skb == NULL) + return; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_DCCP_HEADER); + + skb->dst = dst_clone(rxskb->dst); + + skb->h.raw = skb_push(skb, dccp_hdr_ack_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_ack_len); + + /* Build DCCP header and checksum it. */ + dh->dccph_type = DCCP_PKT_ACK; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_ack_len / 4; + dh->dccph_x = 1; + + dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq); + + bh_lock_sock(dccp_ctl_socket->sk); + err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk, + rxskb->nh.iph->daddr, rxskb->nh.iph->saddr, NULL); + bh_unlock_sock(dccp_ctl_socket->sk); + + if (err == NET_XMIT_CN || err == 0) { + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + } +} + +static void dccp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) +{ + dccp_v4_ctl_send_ack(skb); +} + +static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, + struct dst_entry *dst) +{ + int err = -1; + struct sk_buff *skb; + + /* First, grab a route. */ + + if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL) + goto out; + + skb = dccp_make_response(sk, dst, req); + if (skb != NULL) { + const struct inet_request_sock *ireq = inet_rsk(req); + + err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, + ireq->rmt_addr, + ireq->opt); + if (err == NET_XMIT_CN) + err = 0; + } + +out: + dst_release(dst); + return err; +} + +/* + * This routine is called by the ICMP module when it gets some sort of error + * condition. If err < 0 then the socket should be closed and the error + * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code. + * After adjustment header points to the first 8 bytes of the tcp header. We + * need to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When someone else + * accesses the socket the ICMP is just dropped and for some paths there is no + * check at all. A more general error queue to queue errors for later handling + * is probably better. + */ +void dccp_v4_err(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (struct iphdr *)skb->data; + const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + (iph->ihl << 2)); + struct dccp_sock *dp; + struct inet_sock *inet; + const int type = skb->h.icmph->type; + const int code = skb->h.icmph->code; + struct sock *sk; + __u64 seq; + int err; + + if (skb->len < (iph->ihl << 2) + 8) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + + sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport, + iph->saddr, dh->dccph_sport, inet_iif(skb)); + if (sk == NULL) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + + if (sk->sk_state == DCCP_TIME_WAIT) { + inet_twsk_put((struct inet_timewait_sock *)sk); + return; + } + + bh_lock_sock(sk); + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == DCCP_CLOSED) + goto out; + + dp = dccp_sk(sk); + seq = dccp_hdr_seq(skb); + if (sk->sk_state != DCCP_LISTEN && + !between48(seq, dp->dccps_swl, dp->dccps_swh)) { + NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + switch (type) { + case ICMP_SOURCE_QUENCH: + /* Just silently ignore these. */ + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + goto out; + + if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + if (!sock_owned_by_user(sk)) + dccp_do_pmtu_discovery(sk, iph, info); + goto out; + } + + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + goto out; + } + + switch (sk->sk_state) { + struct request_sock *req , **prev; + case DCCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + req = inet_csk_search_req(sk, &prev, dh->dccph_dport, + iph->daddr, iph->saddr); + if (!req) + goto out; + + /* + * ICMPs are not backlogged, hence we cannot get an established + * socket here. + */ + BUG_TRAP(!req->sk); + + if (seq != dccp_rsk(req)->dreq_iss) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + /* + * Still in RESPOND, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case DCCP_REQUESTING: + case DCCP_RESPOND: + if (!sock_owned_by_user(sk)) { + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + + sk->sk_error_report(sk); + + dccp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + /* If we've already connected we will keep trying + * until we time out, or the user gives up. + * + * rfc1122 4.2.3.9 allows to consider as hard errors + * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, + * but it is obsoleted by pmtu discovery). + * + * Note, that in modern internet, where routing is unreliable + * and in each dark corner broken firewalls sit, sending random + * errors ordered by their masters even this two messages finally lose + * their original sense (even Linux sends invalid PORT_UNREACHs) + * + * Now we are in compliance with RFCs. + * --ANK (980905) + */ + + inet = inet_sk(sk); + if (!sock_owned_by_user(sk) && inet->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else /* Only an error on timeout */ + sk->sk_err_soft = err; +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +extern struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, enum dccp_reset_codes code); + +int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) +{ + struct sk_buff *skb; + /* + * FIXME: what if rebuild_header fails? + * Should we be doing a rebuild_header here? + */ + int err = inet_sk_rebuild_header(sk); + + if (err != 0) + return err; + + skb = dccp_make_reset(sk, sk->sk_dst_cache, code); + if (skb != NULL) { + const struct dccp_sock *dp = dccp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + err = ip_build_and_send_pkt(skb, sk, + inet->saddr, inet->daddr, NULL); + if (err == NET_XMIT_CN) + err = 0; + + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); + } + + return err; +} + +static inline u64 dccp_v4_init_sequence(const struct sock *sk, + const struct sk_buff *skb) +{ + return secure_dccp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + dccp_hdr(skb)->dccph_dport, + dccp_hdr(skb)->dccph_sport); +} + +int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct inet_request_sock *ireq; + struct dccp_sock dp; + struct request_sock *req; + struct dccp_request_sock *dreq; + const __u32 saddr = skb->nh.iph->saddr; + const __u32 daddr = skb->nh.iph->daddr; + struct dst_entry *dst = NULL; + + /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ + if (((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + /* + * TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if (inet_csk_reqsk_queue_is_full(sk)) + goto drop; + + /* + * Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = reqsk_alloc(sk->sk_prot->rsk_prot); + if (req == NULL) + goto drop; + + /* FIXME: process options */ + + dccp_openreq_init(req, &dp, skb); + + ireq = inet_rsk(req); + ireq->loc_addr = daddr; + ireq->rmt_addr = saddr; + /* FIXME: Merge Aristeu's option parsing code when ready */ + req->rcv_wnd = 100; /* Fake, option parsing will get the right value */ + ireq->opt = NULL; + + /* + * Step 3: Process LISTEN state + * + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * + * In fact we defer setting S.GSR, S.SWL, S.SWH to + * dccp_create_openreq_child. + */ + dreq = dccp_rsk(req); + dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dreq->dreq_iss = dccp_v4_init_sequence(sk, skb); + dreq->dreq_service = dccp_hdr_request(skb)->dccph_req_service; + + if (dccp_v4_send_response(sk, req, dst)) + goto drop_and_free; + + inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + return 0; + +drop_and_free: + /* + * FIXME: should be reqsk_free after implementing req->rsk_ops + */ + __reqsk_free(req); +drop: + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + return -1; +} + +/* + * The three way handshake has completed - we got a valid ACK or DATAACK - + * now create the new socket. + * + * This is the equivalent of TCP's tcp_v4_syn_recv_sock + */ +struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet_request_sock *ireq; + struct inet_sock *newinet; + struct dccp_sock *newdp; + struct sock *newsk; + + if (sk_acceptq_is_full(sk)) + goto exit_overflow; + + if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL) + goto exit; + + newsk = dccp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto exit; + + sk_setup_caps(newsk, dst); + + newdp = dccp_sk(newsk); + newinet = inet_sk(newsk); + ireq = inet_rsk(req); + newinet->daddr = ireq->rmt_addr; + newinet->rcv_saddr = ireq->loc_addr; + newinet->saddr = ireq->loc_addr; + newinet->opt = ireq->opt; + ireq->opt = NULL; + newinet->mc_index = inet_iif(skb); + newinet->mc_ttl = skb->nh.iph->ttl; + newinet->id = jiffies; + + dccp_sync_mss(newsk, dst_mtu(dst)); + + __inet_hash(&dccp_hashinfo, newsk, 0); + __inet_inherit_port(&dccp_hashinfo, sk, newsk); + + return newsk; + +exit_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +exit: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + dst_release(dst); + return NULL; +} + +static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + const struct iphdr *iph = skb->nh.iph; + struct sock *nsk; + struct request_sock **prev; + /* Find possible connection requests. */ + struct request_sock *req = inet_csk_search_req(sk, &prev, + dh->dccph_sport, + iph->saddr, iph->daddr); + if (req != NULL) + return dccp_check_req(sk, skb, req, prev); + + nsk = __inet_lookup_established(&dccp_hashinfo, + iph->saddr, dh->dccph_sport, + iph->daddr, ntohs(dh->dccph_dport), + inet_iif(skb)); + if (nsk != NULL) { + if (nsk->sk_state != DCCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put((struct inet_timewait_sock *)nsk); + return NULL; + } + + return sk; +} + +int dccp_v4_checksum(struct sk_buff *skb) +{ + struct dccp_hdr* dh = dccp_hdr(skb); + int checksum_len; + u32 tmp; + + if (dh->dccph_cscov == 0) + checksum_len = skb->len; + else { + checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32); + checksum_len = checksum_len < skb->len ? checksum_len : skb->len; + } + + tmp = csum_partial((unsigned char *)dh, checksum_len, 0); + return csum_fold(tmp); +} + +static int dccp_v4_verify_checksum(struct sk_buff *skb) +{ + struct dccp_hdr *th = dccp_hdr(skb); + const u16 remote_checksum = th->dccph_checksum; + u16 local_checksum; + + /* FIXME: don't mess with skb payload */ + th->dccph_checksum = 0; /* zero it for computation */ + + local_checksum = dccp_v4_checksum(skb); + + /* FIXME: don't mess with skb payload */ + th->dccph_checksum = remote_checksum; /* put it back */ + + return remote_checksum == local_checksum ? 0 : -1; +} + +static struct dst_entry* dccp_v4_route_skb(struct sock *sk, + struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif, + .nl_u = { .ip4_u = + { .daddr = skb->nh.iph->saddr, + .saddr = skb->nh.iph->daddr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = sk->sk_protocol, + .uli_u = { .ports = + { .sport = dccp_hdr(skb)->dccph_dport, + .dport = dccp_hdr(skb)->dccph_sport } } }; + + if (ip_route_output_flow(&rt, &fl, sk, 0)) { + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + + return &rt->u.dst; +} + +void dccp_v4_ctl_send_reset(struct sk_buff *rxskb) +{ + int err; + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct sk_buff *skb; + struct dst_entry *dst; + + /* Never send a reset in response to a reset. */ + if (rxdh->dccph_type == DCCP_PKT_RESET) + return; + + if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL) + return; + + dst = dccp_v4_route_skb(dccp_ctl_socket->sk, rxskb); + if (dst == NULL) + return; + + skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC); + if (skb == NULL) + goto out; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_DCCP_HEADER); + skb->dst = dst_clone(dst); + + skb->h.raw = skb_push(skb, dccp_hdr_reset_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_reset_len); + + /* Build DCCP header and checksum it. */ + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_reset_len / 4; + dh->dccph_x = 1; + dccp_hdr_reset(skb)->dccph_reset_code = DCCP_SKB_CB(rxskb)->dccpd_reset_code; + + dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq); + + dh->dccph_checksum = dccp_v4_checksum(skb); + + bh_lock_sock(dccp_ctl_socket->sk); + err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk, + rxskb->nh.iph->daddr, rxskb->nh.iph->saddr, NULL); + bh_unlock_sock(dccp_ctl_socket->sk); + + if (err == NET_XMIT_CN || err == 0) { + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + } +out: + dst_release(dst); +} + +int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_hdr *dh = dccp_hdr(skb); + + if (sk->sk_state == DCCP_OPEN) { /* Fast path */ + if (dccp_rcv_established(sk, skb, dh, skb->len)) + goto reset; + return 0; + } + + /* + * Step 3: Process LISTEN state + * If S.state == LISTEN, + * If P.type == Request or P contains a valid Init Cookie option, + * * Must scan the packet's options to check for an Init + * Cookie. Only the Init Cookie is processed here, + * however; other options are processed in Step 8. This + * scan need only be performed if the endpoint uses Init + * Cookies * + * * Generate a new socket and switch to that socket * + * Set S := new socket for this port pair + * S.state = RESPOND + * Choose S.ISS (initial seqno) or set from Init Cookie + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * Continue with S.state == RESPOND + * * A Response packet will be generated in Step 11 * + * Otherwise, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + * + * NOTE: the check for the packet types is done in dccp_rcv_state_process + */ + if (sk->sk_state == DCCP_LISTEN) { + struct sock *nsk = dccp_v4_hnd_req(sk, skb); + + if (nsk == NULL) + goto discard; + + if (nsk != sk) { + if (dccp_child_process(sk, nsk, skb)) + goto reset; + return 0; + } + } + + if (dccp_rcv_state_process(sk, skb, dh, skb->len)) + goto reset; + return 0; + +reset: + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + dccp_v4_ctl_send_reset(skb); +discard: + kfree_skb(skb); + return 0; +} + +static inline int dccp_invalid_packet(struct sk_buff *skb) +{ + const struct dccp_hdr *dh; + + if (skb->pkt_type != PACKET_HOST) + return 1; + + if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { + dccp_pr_debug("pskb_may_pull failed\n"); + return 1; + } + + dh = dccp_hdr(skb); + + /* If the packet type is not understood, drop packet and return */ + if (dh->dccph_type >= DCCP_PKT_INVALID) { + dccp_pr_debug("invalid packet type\n"); + return 1; + } + + /* + * If P.Data Offset is too small for packet type, or too large for + * packet, drop packet and return + */ + if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { + dccp_pr_debug("Offset(%u) too small 1\n", dh->dccph_doff); + return 1; + } + + if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) { + dccp_pr_debug("P.Data Offset(%u) too small 2\n", dh->dccph_doff); + return 1; + } + + dh = dccp_hdr(skb); + + /* + * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet + * has short sequence numbers), drop packet and return + */ + if (dh->dccph_x == 0 && + dh->dccph_type != DCCP_PKT_DATA && + dh->dccph_type != DCCP_PKT_ACK && + dh->dccph_type != DCCP_PKT_DATAACK) { + dccp_pr_debug("P.type (%s) not Data, Ack nor DataAck and P.X == 0\n", + dccp_packet_name(dh->dccph_type)); + return 1; + } + + /* If the header checksum is incorrect, drop packet and return */ + if (dccp_v4_verify_checksum(skb) < 0) { + dccp_pr_debug("header checksum is incorrect\n"); + return 1; + } + + return 0; +} + +/* this is called when real data arrives */ +int dccp_v4_rcv(struct sk_buff *skb) +{ + const struct dccp_hdr *dh; + struct sock *sk; + int rc; + + /* Step 1: Check header basics: */ + + if (dccp_invalid_packet(skb)) + goto discard_it; + + dh = dccp_hdr(skb); +#if 0 + /* + * Use something like this to simulate some DATA/DATAACK loss to test + * dccp_ackpkts_add, you'll get something like this on a session that + * sends 10 DATA/DATAACK packets: + * + * dccp_ackpkts_print: 281473596467422 |0,0|3,0|0,0|3,0|0,0|3,0|0,0|3,0|0,1| + * + * 0, 0 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == just this packet + * 0, 1 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == two adjacent packets with the same state + * 3, 0 means: DCCP_ACKPKTS_STATE_NOT_RECEIVED, RLE == just this packet + * + * So... + * + * 281473596467422 was received + * 281473596467421 was not received + * 281473596467420 was received + * 281473596467419 was not received + * 281473596467418 was received + * 281473596467417 was not received + * 281473596467416 was received + * 281473596467415 was not received + * 281473596467414 was received + * 281473596467413 was received (this one was the 3way handshake RESPONSE) + * + */ + if (dh->dccph_type == DCCP_PKT_DATA || dh->dccph_type == DCCP_PKT_DATAACK) { + static int discard = 0; + + if (discard) { + discard = 0; + goto discard_it; + } + discard = 1; + } +#endif + DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); + DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; + + dccp_pr_debug("%8.8s " + "src=%u.%u.%u.%u@%-5d " + "dst=%u.%u.%u.%u@%-5d seq=%llu", + dccp_packet_name(dh->dccph_type), + NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport), + NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport), + DCCP_SKB_CB(skb)->dccpd_seq); + + if (dccp_packet_without_ack(skb)) { + DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; + dccp_pr_debug_cat("\n"); + } else { + DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); + dccp_pr_debug_cat(", ack=%llu\n", DCCP_SKB_CB(skb)->dccpd_ack_seq); + } + + /* Step 2: + * Look up flow ID in table and get corresponding socket */ + sk = __inet_lookup(&dccp_hashinfo, + skb->nh.iph->saddr, dh->dccph_sport, + skb->nh.iph->daddr, ntohs(dh->dccph_dport), + inet_iif(skb)); + + /* + * Step 2: + * If no socket ... + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (sk == NULL) { + dccp_pr_debug("failed to look up flow ID in table and " + "get corresponding socket\n"); + goto no_dccp_socket; + } + + /* + * Step 2: + * ... or S.state == TIMEWAIT, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + + if (sk->sk_state == DCCP_TIME_WAIT) { + dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: discard_and_relse\n"); + goto discard_and_relse; + } + + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + dccp_pr_debug("xfrm4_policy_check failed\n"); + goto discard_and_relse; + } + + if (sk_filter(sk, skb, 0)) { + dccp_pr_debug("sk_filter failed\n"); + goto discard_and_relse; + } + + skb->dev = NULL; + + bh_lock_sock(sk); + rc = 0; + if (!sock_owned_by_user(sk)) + rc = dccp_v4_do_rcv(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + sock_put(sk); + return rc; + +no_dccp_socket: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + /* + * Step 2: + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (dh->dccph_type != DCCP_PKT_RESET) { + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + dccp_v4_ctl_send_reset(skb); + } + +discard_it: + /* Discard frame. */ + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; +} + +static int dccp_v4_init_sock(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + static int dccp_ctl_socket_init = 1; + + dccp_options_init(&dp->dccps_options); + + if (dp->dccps_options.dccpo_send_ack_vector) { + dp->dccps_hc_rx_ackpkts = dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, + GFP_KERNEL); + + if (dp->dccps_hc_rx_ackpkts == NULL) + return -ENOMEM; + } + + /* + * FIXME: We're hardcoding the CCID, and doing this at this point makes + * the listening (master) sock get CCID control blocks, which is not + * necessary, but for now, to not mess with the test userspace apps, + * lets leave it here, later the real solution is to do this in a + * setsockopt(CCIDs-I-want/accept). -acme + */ + if (likely(!dccp_ctl_socket_init)) { + dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_ccid, sk); + dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_ccid, sk); + if (dp->dccps_hc_rx_ccid == NULL || + dp->dccps_hc_tx_ccid == NULL) { + ccid_exit(dp->dccps_hc_rx_ccid, sk); + ccid_exit(dp->dccps_hc_tx_ccid, sk); + dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts); + dp->dccps_hc_rx_ackpkts = NULL; + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + return -ENOMEM; + } + } else + dccp_ctl_socket_init = 0; + + dccp_init_xmit_timers(sk); + sk->sk_state = DCCP_CLOSED; + dp->dccps_mss_cache = 536; + dp->dccps_role = DCCP_ROLE_UNDEFINED; + + return 0; +} + +int dccp_v4_destroy_sock(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + + /* + * DCCP doesn't use sk_qrite_queue, just sk_send_head + * for retransmissions + */ + if (sk->sk_send_head != NULL) { + kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + } + + /* Clean up a referenced DCCP bind bucket. */ + if (inet_csk(sk)->icsk_bind_hash != NULL) + inet_put_port(&dccp_hashinfo, sk); + + dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts); + dp->dccps_hc_rx_ackpkts = NULL; + ccid_exit(dp->dccps_hc_rx_ccid, sk); + ccid_exit(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + + return 0; +} + +static void dccp_v4_reqsk_destructor(struct request_sock *req) +{ + kfree(inet_rsk(req)->opt); +} + +static struct request_sock_ops dccp_request_sock_ops = { + .family = PF_INET, + .obj_size = sizeof(struct dccp_request_sock), + .rtx_syn_ack = dccp_v4_send_response, + .send_ack = dccp_v4_reqsk_send_ack, + .destructor = dccp_v4_reqsk_destructor, + .send_reset = dccp_v4_ctl_send_reset, +}; + +struct proto dccp_v4_prot = { + .name = "DCCP", + .owner = THIS_MODULE, + .close = dccp_close, + .connect = dccp_v4_connect, + .disconnect = dccp_disconnect, + .ioctl = dccp_ioctl, + .init = dccp_v4_init_sock, + .setsockopt = dccp_setsockopt, + .getsockopt = dccp_getsockopt, + .sendmsg = dccp_sendmsg, + .recvmsg = dccp_recvmsg, + .backlog_rcv = dccp_v4_do_rcv, + .hash = dccp_v4_hash, + .unhash = dccp_v4_unhash, + .accept = inet_csk_accept, + .get_port = dccp_v4_get_port, + .shutdown = dccp_shutdown, + .destroy = dccp_v4_destroy_sock, + .orphan_count = &dccp_orphan_count, + .max_header = MAX_DCCP_HEADER, + .obj_size = sizeof(struct dccp_sock), + .rsk_prot = &dccp_request_sock_ops, + .twsk_obj_size = sizeof(struct inet_timewait_sock), /* FIXME! create dccp_timewait_sock */ +}; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c new file mode 100644 index 0000000..810f0c2 --- /dev/null +++ b/net/dccp/minisocks.c @@ -0,0 +1,199 @@ +/* + * net/dccp/minisocks.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "ccid.h" +#include "dccp.h" + +void dccp_time_wait(struct sock *sk, int state, int timeo) +{ + /* FIXME: Implement */ + dccp_pr_debug("Want to help? Start here\n"); + dccp_set_state(sk, state); +} + +/* This is for handling early-kills of TIME_WAIT sockets. */ +void dccp_tw_deschedule(struct inet_timewait_sock *tw) +{ + dccp_pr_debug("Want to help? Start here\n"); + __inet_twsk_kill(tw, &dccp_hashinfo); +} + +struct sock *dccp_create_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) +{ + /* + * Step 3: Process LISTEN state + * + * // Generate a new socket and switch to that socket + * Set S := new socket for this port pair + */ + struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); + + if (newsk != NULL) { + const struct dccp_request_sock *dreq = dccp_rsk(req); + struct inet_connection_sock *newicsk = inet_csk(sk); + struct dccp_sock *newdp = dccp_sk(newsk); + + newdp->dccps_hc_rx_ackpkts = NULL; + newdp->dccps_role = DCCP_ROLE_SERVER; + newicsk->icsk_rto = TCP_TIMEOUT_INIT; + + if (newdp->dccps_options.dccpo_send_ack_vector) { + newdp->dccps_hc_rx_ackpkts = dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, + GFP_ATOMIC); + /* + * XXX: We're using the same CCIDs set on the parent, i.e. sk_clone + * copied the master sock and left the CCID pointers for this child, + * that is why we do the __ccid_get calls. + */ + if (unlikely(newdp->dccps_hc_rx_ackpkts == NULL)) + goto out_free; + } + + if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid, newsk) != 0 || + ccid_hc_tx_init(newdp->dccps_hc_tx_ccid, newsk) != 0)) { + dccp_ackpkts_free(newdp->dccps_hc_rx_ackpkts); + ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk); + ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk); +out_free: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } + + __ccid_get(newdp->dccps_hc_rx_ccid); + __ccid_get(newdp->dccps_hc_tx_ccid); + + /* + * Step 3: Process LISTEN state + * + * Choose S.ISS (initial seqno) or set from Init Cookie + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + */ + + /* See dccp_v4_conn_request */ + newdp->dccps_options.dccpo_sequence_window = req->rcv_wnd; + + newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr; + dccp_update_gsr(newsk, dreq->dreq_isr); + + newdp->dccps_iss = dreq->dreq_iss; + dccp_update_gss(newsk, dreq->dreq_iss); + + dccp_init_xmit_timers(newsk); + + DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); + } + return newsk; +} + +/* + * Process an incoming packet for RESPOND sockets represented + * as an request_sock. + */ +struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct request_sock **prev) +{ + struct sock *child = NULL; + + /* Check for retransmitted REQUEST */ + if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { + if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dccp_rsk(req)->dreq_isr)) { + struct dccp_request_sock *dreq = dccp_rsk(req); + + dccp_pr_debug("Retransmitted REQUEST\n"); + /* Send another RESPONSE packet */ + dccp_set_seqno(&dreq->dreq_iss, dreq->dreq_iss + 1); + dccp_set_seqno(&dreq->dreq_isr, DCCP_SKB_CB(skb)->dccpd_seq); + req->rsk_ops->rtx_syn_ack(sk, req, NULL); + } + /* Network Duplicate, discard packet */ + return NULL; + } + + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; + + if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK && + dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK) + goto drop; + + /* Invalid ACK */ + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) { + dccp_pr_debug("Invalid ACK number: ack_seq=%llu, dreq_iss=%llu\n", + DCCP_SKB_CB(skb)->dccpd_ack_seq, dccp_rsk(req)->dreq_iss); + goto drop; + } + + child = dccp_v4_request_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; + + /* FIXME: deal with options */ + + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); + inet_csk_reqsk_queue_add(sk, req, child); +out: + return child; +listen_overflow: + dccp_pr_debug("listen_overflow!\n"); + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; +drop: + if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) + req->rsk_ops->send_reset(skb); + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; +} + +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ +int dccp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + const int state = child->sk_state; + + if (!sock_owned_by_user(child)) { + ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb), skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == DCCP_RESPOND && child->sk_state != state) + parent->sk_data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. + */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + sock_put(child); + return ret; +} diff --git a/net/dccp/options.c b/net/dccp/options.c new file mode 100644 index 0000000..e186776 --- /dev/null +++ b/net/dccp/options.c @@ -0,0 +1,763 @@ +/* + * net/dccp/options.c + * + * An implementation of the DCCP protocol + * Aristeu Sergio Rozanski Filho + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include + +#include "ccid.h" +#include "dccp.h" + +static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, + struct sock *sk, + const u64 ackno, + const unsigned char len, + const unsigned char *vector); + +/* stores the default values for new connection. may be changed with sysctl */ +static const struct dccp_options dccpo_default_values = { + .dccpo_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW, + .dccpo_ccid = DCCPF_INITIAL_CCID, + .dccpo_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR, + .dccpo_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT, +}; + +void dccp_options_init(struct dccp_options *dccpo) +{ + memcpy(dccpo, &dccpo_default_values, sizeof(*dccpo)); +} + +static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) +{ + u32 value = 0; + + if (len > 3) + value += *bf++ << 24; + if (len > 2) + value += *bf++ << 16; + if (len > 1) + value += *bf++ << 8; + if (len > 0) + value += *bf; + + return value; +} + +int dccp_parse_options(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); +#ifdef DCCP_DEBUG + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx opt: " : + "server rx opt: "; +#endif + const struct dccp_hdr *dh = dccp_hdr(skb); + const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; + unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); + unsigned char *opt_ptr = options; + const unsigned char *opt_end = (unsigned char *)dh + (dh->dccph_doff * 4); + struct dccp_options_received *opt_recv = &dp->dccps_options_received; + unsigned char opt, len; + unsigned char *value; + + memset(opt_recv, 0, sizeof(*opt_recv)); + + while (opt_ptr != opt_end) { + opt = *opt_ptr++; + len = 0; + value = NULL; + + /* Check if this isn't a single byte option */ + if (opt > DCCPO_MAX_RESERVED) { + if (opt_ptr == opt_end) + goto out_invalid_option; + + len = *opt_ptr++; + if (len < 3) + goto out_invalid_option; + /* + * Remove the type and len fields, leaving + * just the value size + */ + len -= 2; + value = opt_ptr; + opt_ptr += len; + + if (opt_ptr > opt_end) + goto out_invalid_option; + } + + switch (opt) { + case DCCPO_PADDING: + break; + case DCCPO_NDP_COUNT: + if (len > 3) + goto out_invalid_option; + + opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); + dccp_pr_debug("%sNDP count=%d\n", debug_prefix, opt_recv->dccpor_ndp); + break; + case DCCPO_ACK_VECTOR_0: + if (len > DCCP_MAX_ACK_VECTOR_LEN) + goto out_invalid_option; + + if (pkt_type == DCCP_PKT_DATA) + continue; + + opt_recv->dccpor_ack_vector_len = len; + opt_recv->dccpor_ack_vector_idx = value - options; + + dccp_pr_debug("%sACK vector 0, len=%d, ack_ackno=%llu\n", + debug_prefix, len, DCCP_SKB_CB(skb)->dccpd_ack_seq); + dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, + value, len); + dccp_ackpkts_check_rcv_ackvector(dp->dccps_hc_rx_ackpkts, sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq, + len, value); + break; + case DCCPO_TIMESTAMP: + if (len != 4) + goto out_invalid_option; + + opt_recv->dccpor_timestamp = ntohl(*(u32 *)value); + + dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp; + dp->dccps_timestamp_time = jiffies; + + dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n", + debug_prefix, opt_recv->dccpor_timestamp, + DCCP_SKB_CB(skb)->dccpd_ack_seq); + break; + case DCCPO_TIMESTAMP_ECHO: + if (len < 4 || len > 8) + goto out_invalid_option; + + opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value); + + dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, diff=%u\n", + debug_prefix, opt_recv->dccpor_timestamp_echo, + len + 2, DCCP_SKB_CB(skb)->dccpd_ack_seq, + tcp_time_stamp - opt_recv->dccpor_timestamp_echo); + + opt_recv->dccpor_elapsed_time = dccp_decode_value_var(value + 4, len - 4); + dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n", debug_prefix, + opt_recv->dccpor_elapsed_time); + break; + case DCCPO_ELAPSED_TIME: + if (len > 4) + goto out_invalid_option; + + if (pkt_type == DCCP_PKT_DATA) + continue; + opt_recv->dccpor_elapsed_time = dccp_decode_value_var(value, len); + dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix, + opt_recv->dccpor_elapsed_time); + break; + /* + * From draft-ietf-dccp-spec-11.txt: + * + * Option numbers 128 through 191 are for options sent from the HC- + * Sender to the HC-Receiver; option numbers 192 through 255 are for + * options sent from the HC-Receiver to the HC-Sender. + */ + case 128 ... 191: { + const u16 idx = value - options; + + if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, opt, len, idx, value) != 0) + goto out_invalid_option; + } + break; + case 192 ... 255: { + const u16 idx = value - options; + + if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, opt, len, idx, value) != 0) + goto out_invalid_option; + } + break; + default: + pr_info("DCCP(%p): option %d(len=%d) not implemented, ignoring\n", + sk, opt, len); + break; + } + } + + return 0; + +out_invalid_option: + DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; + pr_info("DCCP(%p): invalid option %d, len=%d\n", sk, opt, len); + return -1; +} + +static void dccp_encode_value_var(const u32 value, unsigned char *to, + const unsigned int len) +{ + if (len > 3) + *to++ = (value & 0xFF000000) >> 24; + if (len > 2) + *to++ = (value & 0xFF0000) >> 16; + if (len > 1) + *to++ = (value & 0xFF00) >> 8; + if (len > 0) + *to++ = (value & 0xFF); +} + +static inline int dccp_ndp_len(const int ndp) +{ + return likely(ndp <= 0xFF) ? 1 : ndp <= 0xFFFF ? 2 : 3; +} + +void dccp_insert_option(struct sock *sk, struct sk_buff *skb, + const unsigned char option, + const void *value, const unsigned char len) +{ + unsigned char *to; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert %d option!\n", option)); + return; + } + + DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; + + to = skb_push(skb, len + 2); + *to++ = option; + *to++ = len + 2; + + memcpy(to, value, len); +} + +EXPORT_SYMBOL_GPL(dccp_insert_option); + +static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + int ndp = dp->dccps_ndp_count; + + if (dccp_non_data_packet(skb)) + ++dp->dccps_ndp_count; + else + dp->dccps_ndp_count = 0; + + if (ndp > 0) { + unsigned char *ptr; + const int ndp_len = dccp_ndp_len(ndp); + const int len = ndp_len + 2; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + return; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + ptr = skb_push(skb, len); + *ptr++ = DCCPO_NDP_COUNT; + *ptr++ = len; + dccp_encode_value_var(ndp, ptr, ndp_len); + } +} + +static inline int dccp_elapsed_time_len(const u32 elapsed_time) +{ + return elapsed_time == 0 ? 0 : + elapsed_time <= 0xFF ? 1 : + elapsed_time <= 0xFFFF ? 2 : + elapsed_time <= 0xFFFFFF ? 3 : 4; +} + +void dccp_insert_option_elapsed_time(struct sock *sk, + struct sk_buff *skb, + u32 elapsed_time) +{ +#ifdef DCCP_DEBUG + struct dccp_sock *dp = dccp_sk(sk); + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : + "server TX opt: "; +#endif + const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); + const int len = 2 + elapsed_time_len; + unsigned char *to; + + /* If elapsed_time == 0... */ + if (elapsed_time_len == 2) + return; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert elapsed time!\n")); + return; + } + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_ELAPSED_TIME; + *to++ = len; + + dccp_encode_value_var(elapsed_time, to, elapsed_time_len); + + dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n", + debug_prefix, elapsed_time, + len, DCCP_SKB_CB(skb)->dccpd_seq); +} + +EXPORT_SYMBOL(dccp_insert_option_elapsed_time); + +static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); +#ifdef DCCP_DEBUG + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : + "server TX opt: "; +#endif + struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; + int len = ap->dccpap_buf_vector_len + 2; + const u32 elapsed_time = jiffies_to_usecs(jiffies - ap->dccpap_time) / 10; + unsigned char *to, *from; + + if (elapsed_time != 0) + dccp_insert_option_elapsed_time(sk, skb, elapsed_time); + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert ACK Vector!\n")); + return; + } + + /* + * XXX: now we have just one ack vector sent record, so + * we have to wait for it to be cleared. + * + * Of course this is not acceptable, but this is just for + * basic testing now. + */ + if (ap->dccpap_ack_seqno != DCCP_MAX_SEQNO + 1) + return; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_ACK_VECTOR_0; + *to++ = len; + + len = ap->dccpap_buf_vector_len; + from = ap->dccpap_buf + ap->dccpap_buf_head; + + /* Check if buf_head wraps */ + if (ap->dccpap_buf_head + len > ap->dccpap_buf_len) { + const unsigned int tailsize = ap->dccpap_buf_len - ap->dccpap_buf_head; + + memcpy(to, from, tailsize); + to += tailsize; + len -= tailsize; + from = ap->dccpap_buf; + } + + memcpy(to, from, len); + /* + * From draft-ietf-dccp-spec-11.txt: + * + * For each acknowledgement it sends, the HC-Receiver will add an + * acknowledgement record. ack_seqno will equal the HC-Receiver + * sequence number it used for the ack packet; ack_ptr will equal + * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will equal + * buf_nonce. + * + * This implemention uses just one ack record for now. + */ + ap->dccpap_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + ap->dccpap_ack_ptr = ap->dccpap_buf_head; + ap->dccpap_ack_ackno = ap->dccpap_buf_ackno; + ap->dccpap_ack_nonce = ap->dccpap_buf_nonce; + ap->dccpap_ack_vector_len = ap->dccpap_buf_vector_len; + + dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, ack_ackno=%llu\n", + debug_prefix, ap->dccpap_ack_vector_len, + ap->dccpap_ack_seqno, ap->dccpap_ack_ackno); +} + +static inline void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb) +{ + const u32 now = htonl(tcp_time_stamp); + dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now)); +} + +static void dccp_insert_option_timestamp_echo(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); +#ifdef DCCP_DEBUG + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : + "server TX opt: "; +#endif + u32 tstamp_echo; + const u32 elapsed_time = jiffies_to_usecs(jiffies - dp->dccps_timestamp_time) / 10; + const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); + const int len = 6 + elapsed_time_len; + unsigned char *to; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert timestamp echo!\n")); + return; + } + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_TIMESTAMP_ECHO; + *to++ = len; + + tstamp_echo = htonl(dp->dccps_timestamp_echo); + memcpy(to, &tstamp_echo, 4); + to += 4; + dccp_encode_value_var(elapsed_time, to, elapsed_time_len); + + dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n", + debug_prefix, dp->dccps_timestamp_echo, + len, DCCP_SKB_CB(skb)->dccpd_seq); + + dp->dccps_timestamp_echo = 0; + dp->dccps_timestamp_time = 0; +} + +void dccp_insert_options(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + + DCCP_SKB_CB(skb)->dccpd_opt_len = 0; + + if (dp->dccps_options.dccpo_send_ndp_count) + dccp_insert_option_ndp(sk, skb); + + if (!dccp_packet_without_ack(skb)) { + if (dp->dccps_options.dccpo_send_ack_vector && + dp->dccps_hc_rx_ackpkts->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1) + dccp_insert_option_ack_vector(sk, skb); + + dccp_insert_option_timestamp(sk, skb); + if (dp->dccps_timestamp_echo != 0) + dccp_insert_option_timestamp_echo(sk, skb); + } + + ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb); + + /* XXX: insert other options when appropriate */ + + if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) { + /* The length of all options has to be a multiple of 4 */ + int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; + + if (padding != 0) { + padding = 4 - padding; + memset(skb_push(skb, padding), 0, padding); + DCCP_SKB_CB(skb)->dccpd_opt_len += padding; + } + } +} + +struct dccp_ackpkts *dccp_ackpkts_alloc(unsigned int len, int priority) +{ + struct dccp_ackpkts *ap = kmalloc(sizeof(*ap) + len, priority); + + if (ap != NULL) { +#ifdef DCCP_DEBUG + memset(ap->dccpap_buf, 0xFF, len); +#endif + ap->dccpap_buf_len = len; + ap->dccpap_buf_head = ap->dccpap_buf_tail = ap->dccpap_buf_len - 1; + ap->dccpap_buf_ackno = ap->dccpap_ack_ackno = ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + ap->dccpap_buf_nonce = ap->dccpap_buf_nonce = 0; + ap->dccpap_ack_ptr = 0; + ap->dccpap_time = 0; + ap->dccpap_buf_vector_len = ap->dccpap_ack_vector_len = 0; + } + + return ap; +} + +void dccp_ackpkts_free(struct dccp_ackpkts *ap) +{ + if (ap != NULL) { +#ifdef DCCP_DEBUG + memset(ap, 0xFF, sizeof(*ap) + ap->dccpap_buf_len); +#endif + kfree(ap); + } +} + +static inline u8 dccp_ackpkts_state(const struct dccp_ackpkts *ap, + const unsigned int index) +{ + return ap->dccpap_buf[index] & DCCP_ACKPKTS_STATE_MASK; +} + +static inline u8 dccp_ackpkts_len(const struct dccp_ackpkts *ap, + const unsigned int index) +{ + return ap->dccpap_buf[index] & DCCP_ACKPKTS_LEN_MASK; +} + +/* + * If several packets are missing, the HC-Receiver may prefer to enter multiple + * bytes with run length 0, rather than a single byte with a larger run length; + * this simplifies table updates if one of the missing packets arrives. + */ +static inline int dccp_ackpkts_set_buf_head_state(struct dccp_ackpkts *ap, + const unsigned int packets, + const unsigned char state) +{ + unsigned int gap; + signed long new_head; + + if (ap->dccpap_buf_vector_len + packets > ap->dccpap_buf_len) + return -ENOBUFS; + + gap = packets - 1; + new_head = ap->dccpap_buf_head - packets; + + if (new_head < 0) { + if (gap > 0) { + memset(ap->dccpap_buf, DCCP_ACKPKTS_STATE_NOT_RECEIVED, + gap + new_head + 1); + gap = -new_head; + } + new_head += ap->dccpap_buf_len; + } + + ap->dccpap_buf_head = new_head; + + if (gap > 0) + memset(ap->dccpap_buf + ap->dccpap_buf_head + 1, + DCCP_ACKPKTS_STATE_NOT_RECEIVED, gap); + + ap->dccpap_buf[ap->dccpap_buf_head] = state; + ap->dccpap_buf_vector_len += packets; + return 0; +} + +/* + * Implements the draft-ietf-dccp-spec-11.txt Appendix A + */ +int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state) +{ + /* + * Check at the right places if the buffer is full, if it is, tell the + * caller to start dropping packets till the HC-Sender acks our ACK + * vectors, when we will free up space in dccpap_buf. + * + * We may well decide to do buffer compression, etc, but for now lets + * just drop. + * + * From Appendix A: + * + * Of course, the circular buffer may overflow, either when the HC- + * Sender is sending data at a very high rate, when the HC-Receiver's + * acknowledgements are not reaching the HC-Sender, or when the HC- + * Sender is forgetting to acknowledge those acks (so the HC-Receiver + * is unable to clean up old state). In this case, the HC-Receiver + * should either compress the buffer (by increasing run lengths when + * possible), transfer its state to a larger buffer, or, as a last + * resort, drop all received packets, without processing them + * whatsoever, until its buffer shrinks again. + */ + + /* See if this is the first ackno being inserted */ + if (ap->dccpap_buf_vector_len == 0) { + ap->dccpap_buf[ap->dccpap_buf_head] = state; + ap->dccpap_buf_vector_len = 1; + } else if (after48(ackno, ap->dccpap_buf_ackno)) { + const u64 delta = dccp_delta_seqno(ap->dccpap_buf_ackno, ackno); + + /* + * Look if the state of this packet is the same as the previous ackno + * and if so if we can bump the head len. + */ + if (delta == 1 && + dccp_ackpkts_state(ap, ap->dccpap_buf_head) == state && + dccp_ackpkts_len(ap, ap->dccpap_buf_head) < DCCP_ACKPKTS_LEN_MASK) + ap->dccpap_buf[ap->dccpap_buf_head]++; + else if (dccp_ackpkts_set_buf_head_state(ap, delta, state)) + return -ENOBUFS; + } else { + /* + * A.1.2. Old Packets + * + * When a packet with Sequence Number S arrives, and S <= buf_ackno, + * the HC-Receiver will scan the table for the byte corresponding to S. + * (Indexing structures could reduce the complexity of this scan.) + */ + u64 delta = dccp_delta_seqno(ackno, ap->dccpap_buf_ackno); + unsigned int index = ap->dccpap_buf_head; + + while (1) { + const u8 len = dccp_ackpkts_len(ap, index); + const u8 state = dccp_ackpkts_state(ap, index); + /* + * valid packets not yet in dccpap_buf have a reserved entry, with + * a len equal to 0 + */ + if (state == DCCP_ACKPKTS_STATE_NOT_RECEIVED && + len == 0 && delta == 0) { /* Found our reserved seat! */ + dccp_pr_debug("Found %llu reserved seat!\n", ackno); + ap->dccpap_buf[index] = state; + goto out; + } + /* len == 0 means one packet */ + if (delta < len + 1) + goto out_duplicate; + + delta -= len + 1; + if (++index == ap->dccpap_buf_len) + index = 0; + } + } + + ap->dccpap_buf_ackno = ackno; + ap->dccpap_time = jiffies; +out: + dccp_pr_debug(""); + dccp_ackpkts_print(ap); + return 0; + +out_duplicate: + /* Duplicate packet */ + dccp_pr_debug("Received a dup or already considered lost packet: %llu\n", ackno); + return -EILSEQ; +} + +#ifdef DCCP_DEBUG +void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len) +{ + if (!dccp_debug) + return; + + printk("ACK vector len=%d, ackno=%llu |", len, ackno); + + while (len--) { + const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6; + const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK); + + printk("%d,%d|", state, rl); + ++vector; + } + + printk("\n"); +} + +void dccp_ackpkts_print(const struct dccp_ackpkts *ap) +{ + dccp_ackvector_print(ap->dccpap_buf_ackno, + ap->dccpap_buf + ap->dccpap_buf_head, + ap->dccpap_buf_vector_len); +} +#endif + +static void dccp_ackpkts_trow_away_ack_record(struct dccp_ackpkts *ap) +{ + /* + * As we're keeping track of the ack vector size + * (dccpap_buf_vector_len) and the sent ack vector size + * (dccpap_ack_vector_len) we don't need dccpap_buf_tail at all, but + * keep this code here as in the future we'll implement a vector of ack + * records, as suggested in draft-ietf-dccp-spec-11.txt Appendix A. -acme + */ +#if 0 + ap->dccpap_buf_tail = ap->dccpap_ack_ptr + 1; + if (ap->dccpap_buf_tail >= ap->dccpap_buf_len) + ap->dccpap_buf_tail -= ap->dccpap_buf_len; +#endif + ap->dccpap_buf_vector_len -= ap->dccpap_ack_vector_len; +} + +void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk, + u64 ackno) +{ + /* Check if we actually sent an ACK vector */ + if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1) + return; + + if (ackno == ap->dccpap_ack_seqno) { +#ifdef DCCP_DEBUG + struct dccp_sock *dp = dccp_sk(sk); + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx ack: " : + "server rx ack: "; +#endif + dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, ack_ackno=%llu, ACKED!\n", + debug_prefix, 1, + ap->dccpap_ack_seqno, ap->dccpap_ack_ackno); + dccp_ackpkts_trow_away_ack_record(ap); + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + } +} + +static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, + struct sock *sk, u64 ackno, + const unsigned char len, + const unsigned char *vector) +{ + unsigned char i; + + /* Check if we actually sent an ACK vector */ + if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1) + return; + /* + * We're in the receiver half connection, so if the received an ACK vector + * ackno (e.g. 50) before dccpap_ack_seqno (e.g. 52), we're not interested. + * + * Extra explanation with example: + * + * if we received an ACK vector with ackno 50, it can only be acking + * 50, 49, 48, etc, not 52 (the seqno for the ACK vector we sent). + */ + // dccp_pr_debug("is %llu < %llu? ", ackno, ap->dccpap_ack_seqno); + if (before48(ackno, ap->dccpap_ack_seqno)) { + // dccp_pr_debug_cat("yes\n"); + return; + } + // dccp_pr_debug_cat("no\n"); + + i = len; + while (i--) { + const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK); + u64 ackno_end_rl; + + dccp_set_seqno(&ackno_end_rl, ackno - rl); + + // dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl, ap->dccpap_ack_seqno, ackno); + if (between48(ap->dccpap_ack_seqno, ackno_end_rl, ackno)) { + const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6; + // dccp_pr_debug_cat("yes\n"); + + if (state != DCCP_ACKPKTS_STATE_NOT_RECEIVED) { +#ifdef DCCP_DEBUG + struct dccp_sock *dp = dccp_sk(sk); + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx ack: " : + "server rx ack: "; +#endif + dccp_pr_debug("%sACK vector 0, len=%d, ack_seqno=%llu, ack_ackno=%llu, ACKED!\n", + debug_prefix, len, + ap->dccpap_ack_seqno, ap->dccpap_ack_ackno); + dccp_ackpkts_trow_away_ack_record(ap); + } + /* + * If dccpap_ack_seqno was not received, no problem we'll + * send another ACK vector. + */ + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + break; + } + // dccp_pr_debug_cat("no\n"); + + dccp_set_seqno(&ackno, ackno_end_rl - 1); + ++vector; + } +} diff --git a/net/dccp/output.c b/net/dccp/output.c new file mode 100644 index 0000000..22ca291 --- /dev/null +++ b/net/dccp/output.c @@ -0,0 +1,406 @@ +/* + * net/dccp/output.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include + +#include "ccid.h" +#include "dccp.h" + +static inline void dccp_event_ack_sent(struct sock *sk) +{ + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); +} + +/* + * All SKB's seen here are completely headerless. It is our + * job to build the DCCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + */ +int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) +{ + if (likely(skb != NULL)) { + const struct inet_sock *inet = inet_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + struct dccp_hdr *dh; + /* XXX For now we're using only 48 bits sequence numbers */ + const int dccp_header_size = sizeof(*dh) + + sizeof(struct dccp_hdr_ext) + + dccp_packet_hdr_len(dcb->dccpd_type); + int err, set_ack = 1; + u64 ackno = dp->dccps_gsr; + + /* + * FIXME: study DCCP_PKT_SYNC[ACK] to see what is the right thing + * to do here... + */ + dccp_inc_seqno(&dp->dccps_gss); + + dcb->dccpd_seq = dp->dccps_gss; + dccp_insert_options(sk, skb); + + switch (dcb->dccpd_type) { + case DCCP_PKT_DATA: + set_ack = 0; + break; + case DCCP_PKT_SYNC: + case DCCP_PKT_SYNCACK: + ackno = dcb->dccpd_seq; + break; + } + + skb->h.raw = skb_push(skb, dccp_header_size); + dh = dccp_hdr(skb); + /* Data packets are not cloned as they are never retransmitted */ + if (skb_cloned(skb)) + skb_set_owner_w(skb, sk); + + /* Build DCCP header and checksum it. */ + memset(dh, 0, dccp_header_size); + dh->dccph_type = dcb->dccpd_type; + dh->dccph_sport = inet->sport; + dh->dccph_dport = inet->dport; + dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4; + dh->dccph_ccval = dcb->dccpd_ccval; + /* XXX For now we're using only 48 bits sequence numbers */ + dh->dccph_x = 1; + + dp->dccps_awh = dp->dccps_gss; + dccp_hdr_set_seq(dh, dp->dccps_gss); + if (set_ack) + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno); + + switch (dcb->dccpd_type) { + case DCCP_PKT_REQUEST: + dccp_hdr_request(skb)->dccph_req_service = dcb->dccpd_service; + break; + case DCCP_PKT_RESET: + dccp_hdr_reset(skb)->dccph_reset_code = dcb->dccpd_reset_code; + break; + } + + dh->dccph_checksum = dccp_v4_checksum(skb); + + if (dcb->dccpd_type == DCCP_PKT_ACK || + dcb->dccpd_type == DCCP_PKT_DATAACK) + dccp_event_ack_sent(sk); + + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + + err = ip_queue_xmit(skb, 0); + if (err <= 0) + return err; + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; + } + return -ENOBUFS; +} + +unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) +{ + struct dccp_sock *dp = dccp_sk(sk); + int mss_now; + + /* + * FIXME: we really should be using the af_specific thing to support IPv6. + * mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); + */ + mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); + + /* Now subtract optional transport overhead */ + mss_now -= dp->dccps_ext_header_len; + + /* + * FIXME: this should come from the CCID infrastructure, where, say, + * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets + * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED + * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to + * make it a multiple of 4 + */ + + mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; + + /* And store cached results */ + dp->dccps_pmtu_cookie = pmtu; + dp->dccps_mss_cache = mss_now; + + return mss_now; +} + +int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ + if (inet_sk_rebuild_header(sk) != 0) + return -EHOSTUNREACH; /* Routing failure or similar. */ + + return dccp_transmit_skb(sk, (skb_cloned(skb) ? + pskb_copy(skb, GFP_ATOMIC): + skb_clone(skb, GFP_ATOMIC))); +} + +struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, + struct request_sock *req) +{ + struct dccp_hdr *dh; + const int dccp_header_size = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_response); + struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN + + dccp_header_size, 1, + GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size); + + skb->dst = dst_clone(dst); + skb->csum = 0; + + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; + DCCP_SKB_CB(skb)->dccpd_seq = dccp_rsk(req)->dreq_iss; + dccp_insert_options(sk, skb); + + skb->h.raw = skb_push(skb, dccp_header_size); + + dh = dccp_hdr(skb); + memset(dh, 0, dccp_header_size); + + dh->dccph_sport = inet_sk(sk)->sport; + dh->dccph_dport = inet_rsk(req)->rmt_port; + dh->dccph_doff = (dccp_header_size + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; + dh->dccph_type = DCCP_PKT_RESPONSE; + dh->dccph_x = 1; + dccp_hdr_set_seq(dh, dccp_rsk(req)->dreq_iss); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dccp_rsk(req)->dreq_isr); + + dh->dccph_checksum = dccp_v4_checksum(skb); + + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + return skb; +} + +struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, + const enum dccp_reset_codes code) + +{ + struct dccp_hdr *dh; + struct dccp_sock *dp = dccp_sk(sk); + const int dccp_header_size = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN + + dccp_header_size, 1, + GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size); + + skb->dst = dst_clone(dst); + skb->csum = 0; + + dccp_inc_seqno(&dp->dccps_gss); + + DCCP_SKB_CB(skb)->dccpd_reset_code = code; + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET; + DCCP_SKB_CB(skb)->dccpd_seq = dp->dccps_gss; + dccp_insert_options(sk, skb); + + skb->h.raw = skb_push(skb, dccp_header_size); + + dh = dccp_hdr(skb); + memset(dh, 0, dccp_header_size); + + dh->dccph_sport = inet_sk(sk)->sport; + dh->dccph_dport = inet_sk(sk)->dport; + dh->dccph_doff = (dccp_header_size + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_x = 1; + dccp_hdr_set_seq(dh, dp->dccps_gss); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dp->dccps_gsr); + + dccp_hdr_reset(skb)->dccph_reset_code = code; + + dh->dccph_checksum = dccp_v4_checksum(skb); + + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + return skb; +} + +/* + * Do all connect socket setups that can be done AF independent. + */ +static inline void dccp_connect_init(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + sk->sk_err = 0; + sock_reset_flag(sk, SOCK_DONE); + + dccp_sync_mss(sk, dst_mtu(dst)); + + /* + * FIXME: set dp->{dccps_swh,dccps_swl}, with + * something like dccp_inc_seq + */ + + icsk->icsk_retransmits = 0; +} + +int dccp_connect(struct sock *sk) +{ + struct sk_buff *skb; + struct inet_connection_sock *icsk = inet_csk(sk); + + dccp_connect_init(sk); + + skb = alloc_skb(MAX_DCCP_HEADER + 15, sk->sk_allocation); + if (unlikely(skb == NULL)) + return -ENOBUFS; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_DCCP_HEADER); + + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; + /* FIXME: set service to something meaningful, coming + * from userspace*/ + DCCP_SKB_CB(skb)->dccpd_service = 0; + skb->csum = 0; + skb_set_owner_w(skb, sk); + + BUG_TRAP(sk->sk_send_head == NULL); + sk->sk_send_head = skb; + dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); + DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); + + /* Timer for repeating the REQUEST until an answer. */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + return 0; +} + +void dccp_send_ack(struct sock *sk) +{ + /* If we have been reset, we may not send again. */ + if (sk->sk_state != DCCP_CLOSED) { + struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC); + + if (skb == NULL) { + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX); + return; + } + + /* Reserve space for headers */ + skb_reserve(skb, MAX_DCCP_HEADER); + skb->csum = 0; + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; + skb_set_owner_w(skb, sk); + dccp_transmit_skb(sk, skb); + } +} + +EXPORT_SYMBOL_GPL(dccp_send_ack); + +void dccp_send_delayed_ack(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + /* + * FIXME: tune this timer. elapsed time fixes the skew, so no problem + * with using 2s, and active senders also piggyback the ACK into a + * DATAACK packet, so this is really for quiescent senders. + */ + unsigned long timeout = jiffies + 2 * HZ; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { + /* If delack timer was blocked or is about to expire, + * send ACK now. + * + * FIXME: check the "about to expire" part + */ + if (icsk->icsk_ack.blocked) { + dccp_send_ack(sk); + return; + } + + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; + } + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); +} + +void dccp_send_sync(struct sock *sk, u64 seq) +{ + /* + * We are not putting this on the write queue, so + * dccp_transmit_skb() will set the ownership to this + * sock. + */ + struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC); + + if (skb == NULL) + /* FIXME: how to make sure the sync is sent? */ + return; + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_DCCP_HEADER); + skb->csum = 0; + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_SYNC; + DCCP_SKB_CB(skb)->dccpd_seq = seq; + + skb_set_owner_w(skb, sk); + dccp_transmit_skb(sk, skb); +} + +/* Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This cannot be + * allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under any circumstances. + */ +void dccp_send_close(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + + /* Socket is locked, keep trying until memory is available. */ + for (;;) { + skb = alloc_skb(sk->sk_prot->max_header, GFP_KERNEL); + if (skb != NULL) + break; + yield(); + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, sk->sk_prot->max_header); + skb->csum = 0; + DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ? DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; + + skb_set_owner_w(skb, sk); + dccp_transmit_skb(sk, skb); + + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); +} diff --git a/net/dccp/proto.c b/net/dccp/proto.c new file mode 100644 index 0000000..70284e6 --- /dev/null +++ b/net/dccp/proto.c @@ -0,0 +1,818 @@ +/* + * net/dccp/proto.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "ccid.h" +#include "dccp.h" + +DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics); + +atomic_t dccp_orphan_count = ATOMIC_INIT(0); + +static struct net_protocol dccp_protocol = { + .handler = dccp_v4_rcv, + .err_handler = dccp_v4_err, +}; + +const char *dccp_packet_name(const int type) +{ + static const char *dccp_packet_names[] = { + [DCCP_PKT_REQUEST] = "REQUEST", + [DCCP_PKT_RESPONSE] = "RESPONSE", + [DCCP_PKT_DATA] = "DATA", + [DCCP_PKT_ACK] = "ACK", + [DCCP_PKT_DATAACK] = "DATAACK", + [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", + [DCCP_PKT_CLOSE] = "CLOSE", + [DCCP_PKT_RESET] = "RESET", + [DCCP_PKT_SYNC] = "SYNC", + [DCCP_PKT_SYNCACK] = "SYNCACK", + }; + + if (type >= DCCP_NR_PKT_TYPES) + return "INVALID"; + else + return dccp_packet_names[type]; +} + +EXPORT_SYMBOL_GPL(dccp_packet_name); + +const char *dccp_state_name(const int state) +{ + static char *dccp_state_names[] = { + [DCCP_OPEN] = "OPEN", + [DCCP_REQUESTING] = "REQUESTING", + [DCCP_PARTOPEN] = "PARTOPEN", + [DCCP_LISTEN] = "LISTEN", + [DCCP_RESPOND] = "RESPOND", + [DCCP_CLOSING] = "CLOSING", + [DCCP_TIME_WAIT] = "TIME_WAIT", + [DCCP_CLOSED] = "CLOSED", + }; + + if (state >= DCCP_MAX_STATES) + return "INVALID STATE!"; + else + return dccp_state_names[state]; +} + +EXPORT_SYMBOL_GPL(dccp_state_name); + +static inline int dccp_listen_start(struct sock *sk) +{ + dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN; + return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); +} + +int dccp_disconnect(struct sock *sk, int flags) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); + int err = 0; + const int old_state = sk->sk_state; + + if (old_state != DCCP_CLOSED) + dccp_set_state(sk, DCCP_CLOSED); + + /* ABORT function of RFC793 */ + if (old_state == DCCP_LISTEN) { + inet_csk_listen_stop(sk); + /* FIXME: do the active reset thing */ + } else if (old_state == DCCP_REQUESTING) + sk->sk_err = ECONNRESET; + + dccp_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + if (sk->sk_send_head != NULL) { + __kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + } + + inet->dport = 0; + + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + + sk->sk_shutdown = 0; + sock_reset_flag(sk, SOCK_DONE); + + icsk->icsk_backoff = 0; + inet_csk_delack_init(sk); + __sk_dst_reset(sk); + + BUG_TRAP(!inet->num || icsk->icsk_bind_hash); + + sk->sk_error_report(sk); + return err; +} + +int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + dccp_pr_debug("entry\n"); + return -ENOIOCTLCMD; +} + +int dccp_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + dccp_pr_debug("entry\n"); + + if (level != SOL_DCCP) + return ip_setsockopt(sk, level, optname, optval, optlen); + + return -EOPNOTSUPP; +} + +int dccp_getsockopt(struct sock *sk, int level, int optname, + char *optval, int *optlen) +{ + dccp_pr_debug("entry\n"); + + if (level != SOL_DCCP) + return ip_getsockopt(sk, level, optname, optval, optlen); + + return -EOPNOTSUPP; +} + +int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + const struct dccp_sock *dp = dccp_sk(sk); + const int flags = msg->msg_flags; + const int noblock = flags & MSG_DONTWAIT; + struct sk_buff *skb; + int rc, size; + long timeo; + + if (len > dp->dccps_mss_cache) + return -EMSGSIZE; + + lock_sock(sk); + + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* + * We have to use sk_stream_wait_connect here to set sk_write_pending, + * so that the trick in dccp_rcv_request_sent_state_process. + */ + /* Wait for a connection to finish. */ + if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING)) + if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_err; + + size = sk->sk_prot->max_header + len; + release_sock(sk); + skb = sock_alloc_send_skb(sk, size, noblock, &rc); + lock_sock(sk); + + if (skb == NULL) + goto out_release; + + skb_reserve(skb, sk->sk_prot->max_header); + rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if (rc == 0) { + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + const struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; + long delay; + + /* + * XXX: This is just to match the Waikato tree CA interaction + * points, after the CCID3 code is stable and I have a better + * understanding of behaviour I'll change this to look more like + * TCP. + */ + while (1) { + rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, + skb, len, &delay); + if (rc == 0) + break; + if (rc != -EAGAIN) + goto out_discard; + if (delay > timeo) + goto out_discard; + release_sock(sk); + delay = schedule_timeout(delay); + lock_sock(sk); + timeo -= delay; + if (signal_pending(current)) + goto out_interrupted; + rc = -EPIPE; + if (!(sk->sk_state == DCCP_PARTOPEN || sk->sk_state == DCCP_OPEN)) + goto out_discard; + } + + if (sk->sk_state == DCCP_PARTOPEN) { + /* See 8.1.5. Handshake Completion */ + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + dcb->dccpd_type = DCCP_PKT_DATAACK; + /* FIXME: we really should have a dccps_ack_pending or use icsk */ + } else if (inet_csk_ack_scheduled(sk) || + (dp->dccps_options.dccpo_send_ack_vector && + ap->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1 && + ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)) + dcb->dccpd_type = DCCP_PKT_DATAACK; + else + dcb->dccpd_type = DCCP_PKT_DATA; + dccp_transmit_skb(sk, skb); + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); + } else { +out_discard: + kfree_skb(skb); + } +out_release: + release_sock(sk); + return rc ? : len; +out_err: + rc = sk_stream_error(sk, flags, rc); + goto out_release; +out_interrupted: + rc = sock_intr_errno(timeo); + goto out_discard; +} + +EXPORT_SYMBOL(dccp_sendmsg); + +int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, int flags, int *addr_len) +{ + const struct dccp_hdr *dh; + int copied = 0; + unsigned long used; + int err; + int target; /* Read at least this many bytes */ + long timeo; + + lock_sock(sk); + + err = -ENOTCONN; + if (sk->sk_state == DCCP_LISTEN) + goto out; + + timeo = sock_rcvtimeo(sk, nonblock); + + /* Urgent data needs to be handled specially. */ + if (flags & MSG_OOB) + goto recv_urg; + + /* FIXME */ +#if 0 + seq = &tp->copied_seq; + if (flags & MSG_PEEK) { + peek_seq = tp->copied_seq; + seq = &peek_seq; + } +#endif + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + do { + struct sk_buff *skb; + u32 offset; + + /* FIXME */ +#if 0 + /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ + if (tp->urg_data && tp->urg_seq == *seq) { + if (copied) + break; + if (signal_pending(current)) { + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; + break; + } + } +#endif + + /* Next get a buffer. */ + + skb = skb_peek(&sk->sk_receive_queue); + do { + if (!skb) + break; + + offset = 0; + dh = dccp_hdr(skb); + + if (dh->dccph_type == DCCP_PKT_DATA || + dh->dccph_type == DCCP_PKT_DATAACK) + goto found_ok_skb; + + if (dh->dccph_type == DCCP_PKT_RESET || + dh->dccph_type == DCCP_PKT_CLOSE) { + dccp_pr_debug("found fin ok!\n"); + goto found_fin_ok; + } + dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); + BUG_TRAP(flags & MSG_PEEK); + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->sk_receive_queue); + + /* Well, if we have backlog, try to process it now yet. */ + if (copied >= target && !sk->sk_backlog.tail) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == DCCP_CLOSED || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current) || + (flags & MSG_PEEK)) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == DCCP_CLOSED) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; + break; + } + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + /* FIXME: cleanup_rbuf(sk, copied); */ + + if (copied >= target) { + /* Do not sleep, just process backlog. */ + release_sock(sk); + lock_sock(sk); + } else + sk_wait_data(sk, &timeo); + + continue; + + found_ok_skb: + /* Ok so how much can we use? */ + used = skb->len - offset; + if (len < used) + used = len; + + if (!(flags & MSG_TRUNC)) { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + } + + copied += used; + len -= used; + + /* FIXME: tcp_rcv_space_adjust(sk); */ + +//skip_copy: + if (used + offset < skb->len) + continue; + + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + continue; + found_fin_ok: + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + break; + + } while (len > 0); + + /* According to UNIX98, msg_name/msg_namelen are ignored + * on connected socket. I was just happy when found this 8) --ANK + */ + + /* Clean up data we have read: This will do ACK frames. */ + /* FIXME: cleanup_rbuf(sk, copied); */ + + release_sock(sk); + return copied; + +out: + release_sock(sk); + return err; + +recv_urg: + /* FIXME: err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); */ + goto out; +} + +static int inet_dccp_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + int err; + + lock_sock(sk); + + err = -EINVAL; + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) + goto out; + + old_state = sk->sk_state; + if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) + goto out; + + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. + */ + if (old_state != DCCP_LISTEN) { + /* + * FIXME: here it probably should be sk->sk_prot->listen_start + * see tcp_listen_start + */ + err = dccp_listen_start(sk); + if (err) + goto out; + } + sk->sk_max_ack_backlog = backlog; + err = 0; + +out: + release_sock(sk); + return err; +} + +static const unsigned char dccp_new_state[] = { + /* current state: new state: action: */ + [0] = DCCP_CLOSED, + [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, + [DCCP_REQUESTING] = DCCP_CLOSED, + [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, + [DCCP_LISTEN] = DCCP_CLOSED, + [DCCP_RESPOND] = DCCP_CLOSED, + [DCCP_CLOSING] = DCCP_CLOSED, + [DCCP_TIME_WAIT] = DCCP_CLOSED, + [DCCP_CLOSED] = DCCP_CLOSED, +}; + +static int dccp_close_state(struct sock *sk) +{ + const int next = dccp_new_state[sk->sk_state]; + const int ns = next & DCCP_STATE_MASK; + + if (ns != sk->sk_state) + dccp_set_state(sk, ns); + + return next & DCCP_ACTION_FIN; +} + +void dccp_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + + lock_sock(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == DCCP_LISTEN) { + dccp_set_state(sk, DCCP_CLOSED); + + /* Special case. */ + inet_csk_listen_stop(sk); + + goto adjudge_to_death; + } + + /* + * We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + *reader process may not have drained the data yet! + */ + /* FIXME: check for unread data */ + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + __kfree_skb(skb); + } + + if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. */ + sk->sk_prot->disconnect(sk, 0); + } else if (dccp_close_state(sk)) { + dccp_send_close(sk); + } + + sk_stream_wait_close(sk, timeout); + +adjudge_to_death: + release_sock(sk); + /* + * Now socket is owned by kernel and we acquire BH lock + * to finish close. No need to check for user refs. + */ + local_bh_disable(); + bh_lock_sock(sk); + BUG_TRAP(!sock_owned_by_user(sk)); + + sock_hold(sk); + sock_orphan(sk); + + if (sk->sk_state != DCCP_CLOSED) + dccp_set_state(sk, DCCP_CLOSED); + + atomic_inc(&dccp_orphan_count); + if (sk->sk_state == DCCP_CLOSED) + inet_csk_destroy_sock(sk); + + /* Otherwise, socket is reprieved until protocol close. */ + + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} + +void dccp_shutdown(struct sock *sk, int how) +{ + dccp_pr_debug("entry\n"); +} + +struct proto_ops inet_dccp_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, + .poll = sock_no_poll, + .ioctl = inet_ioctl, + .listen = inet_dccp_listen, /* FIXME: work on inet_listen to rename it to sock_common_listen */ + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +extern struct net_proto_family inet_family_ops; + +static struct inet_protosw dccp_v4_protosw = { + .type = SOCK_DCCP, + .protocol = IPPROTO_DCCP, + .prot = &dccp_v4_prot, + .ops = &inet_dccp_ops, + .capability = -1, + .no_check = 0, + .flags = 0, +}; + +/* + * This is the global socket data structure used for responding to + * the Out-of-the-blue (OOTB) packets. A control sock will be created + * for this socket at the initialization time. + */ +struct socket *dccp_ctl_socket; + +static char dccp_ctl_socket_err_msg[] __initdata = + KERN_ERR "DCCP: Failed to create the control socket.\n"; + +static int __init dccp_ctl_sock_init(void) +{ + int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP, + &dccp_ctl_socket); + if (rc < 0) + printk(dccp_ctl_socket_err_msg); + else { + dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC; + inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk); + } + + return rc; +} + +static void __exit dccp_ctl_sock_exit(void) +{ + if (dccp_ctl_socket != NULL) + sock_release(dccp_ctl_socket); +} + +static int __init init_dccp_v4_mibs(void) +{ + int rc = -ENOMEM; + + dccp_statistics[0] = alloc_percpu(struct dccp_mib); + if (dccp_statistics[0] == NULL) + goto out; + + dccp_statistics[1] = alloc_percpu(struct dccp_mib); + if (dccp_statistics[1] == NULL) + goto out_free_one; + + rc = 0; +out: + return rc; +out_free_one: + free_percpu(dccp_statistics[0]); + dccp_statistics[0] = NULL; + goto out; + +} + +static int thash_entries; +module_param(thash_entries, int, 0444); +MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); + +int dccp_debug; +module_param(dccp_debug, int, 0444); +MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); + +static int __init dccp_init(void) +{ + unsigned long goal; + int ehash_order, bhash_order, i; + int rc = proto_register(&dccp_v4_prot, 1); + + if (rc) + goto out; + + dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", + sizeof(struct inet_bind_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!dccp_hashinfo.bind_bucket_cachep) + goto out_proto_unregister; + + /* + * Size and allocate the main established and bind bucket + * hash tables. + * + * The methodology is similar to that of the buffer cache. + */ + if (num_physpages >= (128 * 1024)) + goal = num_physpages >> (21 - PAGE_SHIFT); + else + goal = num_physpages >> (23 - PAGE_SHIFT); + + if (thash_entries) + goal = (thash_entries * sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; + for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) + ; + do { + dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE / + sizeof(struct inet_ehash_bucket); + dccp_hashinfo.ehash_size >>= 1; + while (dccp_hashinfo.ehash_size & (dccp_hashinfo.ehash_size - 1)) + dccp_hashinfo.ehash_size--; + dccp_hashinfo.ehash = (struct inet_ehash_bucket *) + __get_free_pages(GFP_ATOMIC, ehash_order); + } while (!dccp_hashinfo.ehash && --ehash_order > 0); + + if (!dccp_hashinfo.ehash) { + printk(KERN_CRIT "Failed to allocate DCCP " + "established hash table\n"); + goto out_free_bind_bucket_cachep; + } + + for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) { + rwlock_init(&dccp_hashinfo.ehash[i].lock); + INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain); + } + + bhash_order = ehash_order; + + do { + dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / + sizeof(struct inet_bind_hashbucket); + if ((dccp_hashinfo.bhash_size > (64 * 1024)) && bhash_order > 0) + continue; + dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) + __get_free_pages(GFP_ATOMIC, bhash_order); + } while (!dccp_hashinfo.bhash && --bhash_order >= 0); + + if (!dccp_hashinfo.bhash) { + printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n"); + goto out_free_dccp_ehash; + } + + for (i = 0; i < dccp_hashinfo.bhash_size; i++) { + spin_lock_init(&dccp_hashinfo.bhash[i].lock); + INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); + } + + if (init_dccp_v4_mibs()) + goto out_free_dccp_bhash; + + rc = -EAGAIN; + if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP)) + goto out_free_dccp_v4_mibs; + + inet_register_protosw(&dccp_v4_protosw); + + rc = dccp_ctl_sock_init(); + if (rc) + goto out_unregister_protosw; +out: + return rc; +out_unregister_protosw: + inet_unregister_protosw(&dccp_v4_protosw); + inet_del_protocol(&dccp_protocol, IPPROTO_DCCP); +out_free_dccp_v4_mibs: + free_percpu(dccp_statistics[0]); + free_percpu(dccp_statistics[1]); + dccp_statistics[0] = dccp_statistics[1] = NULL; +out_free_dccp_bhash: + free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); + dccp_hashinfo.bhash = NULL; +out_free_dccp_ehash: + free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); + dccp_hashinfo.ehash = NULL; +out_free_bind_bucket_cachep: + kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); + dccp_hashinfo.bind_bucket_cachep = NULL; +out_proto_unregister: + proto_unregister(&dccp_v4_prot); + goto out; +} + +static const char dccp_del_proto_err_msg[] __exitdata = + KERN_ERR "can't remove dccp net_protocol\n"; + +static void __exit dccp_fini(void) +{ + dccp_ctl_sock_exit(); + + inet_unregister_protosw(&dccp_v4_protosw); + + if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0) + printk(dccp_del_proto_err_msg); + + /* Free the control endpoint. */ + sock_release(dccp_ctl_socket); + + proto_unregister(&dccp_v4_prot); + + kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); +} + +module_init(dccp_init); +module_exit(dccp_fini); + +/* __stringify doesn't likes enums, so use SOCK_DCCP (6) value directly */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-6"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/timer.c b/net/dccp/timer.c new file mode 100644 index 0000000..8c396ee --- /dev/null +++ b/net/dccp/timer.c @@ -0,0 +1,249 @@ +/* + * net/dccp/timer.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "dccp.h" + +static void dccp_write_timer(unsigned long data); +static void dccp_keepalive_timer(unsigned long data); +static void dccp_delack_timer(unsigned long data); + +void dccp_init_xmit_timers(struct sock *sk) +{ + inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, + &dccp_keepalive_timer); +} + +static void dccp_write_err(struct sock *sk) +{ + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; + sk->sk_error_report(sk); + + dccp_v4_send_reset(sk, DCCP_RESET_CODE_ABORTED); + dccp_done(sk); + DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT); +} + +/* A write timeout has occurred. Process the after effects. */ +static int dccp_write_timeout(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + int retry_until; + + if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { + if (icsk->icsk_retransmits != 0) + dst_negative_advice(&sk->sk_dst_cache); + retry_until = icsk->icsk_syn_retries ? : /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */; + } else { + if (icsk->icsk_retransmits >= /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black + hole detection. :-( + + It is place to make it. It is not made. I do not want + to make it. It is disguisting. It does not work in any + case. Let me to cite the same draft, which requires for + us to implement this: + + "The one security concern raised by this memo is that ICMP black holes + are often caused by over-zealous security administrators who block + all ICMP messages. It is vitally important that those who design and + deploy security systems understand the impact of strict filtering on + upper-layer protocols. The safest web site in the world is worthless + if most TCP implementations cannot transfer data from it. It would + be far nicer to have all of the black holes fixed rather than fixing + all of the TCP implementations." + + Golden words :-). + */ + + dst_negative_advice(&sk->sk_dst_cache); + } + + retry_until = /* FIXME! */ 15 /* FIXME! sysctl_tcp_retries2 */; + /* + * FIXME: see tcp_write_timout and tcp_out_of_resources + */ + } + + if (icsk->icsk_retransmits >= retry_until) { + /* Has it gone just too far? */ + dccp_write_err(sk); + return 1; + } + return 0; +} + +/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ +static void dccp_delack_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = inet_csk(sk); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + icsk->icsk_ack.blocked = 1; + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); + sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); + goto out; + } + + if (sk->sk_state == DCCP_CLOSED || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + goto out; + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + goto out; + } + + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; + + if (inet_csk_ack_scheduled(sk)) { + if (!icsk->icsk_ack.pingpong) { + /* Delayed ACK missed: inflate ATO. */ + icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); + } else { + /* Delayed ACK missed: leave pingpong mode and + * deflate ATO. + */ + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; + } + dccp_send_ack(sk); + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * The DCCP retransmit timer. + */ +static void dccp_retransmit_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* + * sk->sk_send_head has to have one skb with + * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP + * packet types (REQUEST, RESPONSE, the ACK in the 3way hanshake + * (PARTOPEN timer), etc). + */ + BUG_TRAP(sk->sk_send_head != NULL); + + /* + * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was + * sent, no need to retransmit, this sock is dead. + */ + if (dccp_write_timeout(sk)) + goto out; + + /* + * We want to know the number of packets retransmitted, not the + * total number of retransmissions of clones of original packets. + */ + if (icsk->icsk_retransmits == 0) + DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS); + + if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) { + /* + * Retransmission failed because of local congestion, + * do not backoff. + */ + if (icsk->icsk_retransmits == 0) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, + TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); + goto out; + } + + icsk->icsk_backoff++; + icsk->icsk_retransmits++; + + icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + if (icsk->icsk_retransmits > 3 /* FIXME: sysctl_dccp_retries1 */) + __sk_dst_reset(sk); +out:; +} + +static void dccp_write_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = inet_csk(sk); + int event = 0; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later */ + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); + goto out; + } + + if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) + goto out; + + if (time_after(icsk->icsk_timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + goto out; + } + + event = icsk->icsk_pending; + icsk->icsk_pending = 0; + + switch (event) { + case ICSK_TIME_RETRANS: + dccp_retransmit_timer(sk); + break; + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * Timer for listening sockets + */ +static void dccp_response_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const int max_retries = icsk->icsk_syn_retries ? : TCP_SYNACK_RETRIES /* FIXME sysctl_tcp_synack_retries */; + + reqsk_queue_prune(&icsk->icsk_accept_queue, sk, TCP_SYNQ_INTERVAL, + DCCP_TIMEOUT_INIT, DCCP_RTO_MAX, max_retries); +} + +static void dccp_keepalive_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + /* Only process if socket is not in use. */ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + inet_csk_reset_keepalive_timer(sk, HZ / 20); + goto out; + } + + if (sk->sk_state == DCCP_LISTEN) { + dccp_response_timer(sk); + goto out; + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} -- cgit v0.10.2 From a019d6fe2b9da68ea4ba6cf3c4e86fc1dbf554c3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:15:09 -0700 Subject: [ICSK]: Move generalised functions from tcp to inet_connection_sock This also improves reqsk_queue_prune and renames it to inet_csk_reqsk_queue_prune, as it deals with both inet_connection_sock and inet_request_sock objects, not just with request_sock ones thus belonging to inet_request_sock. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 692825f..bec19d5 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -239,6 +239,13 @@ static inline void inet_csk_reqsk_queue_drop(struct sock *sk, reqsk_free(req); } +extern void inet_csk_reqsk_queue_prune(struct sock *parent, + const unsigned long interval, + const unsigned long timeout, + const unsigned long max_rto); + +extern void inet_csk_destroy_sock(struct sock *sk); +extern int inet_csk_listen_start(struct sock *sk, const int nr_table_entries); extern void inet_csk_listen_stop(struct sock *sk); #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 447d287..b52cc52 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -258,8 +258,4 @@ static inline void reqsk_queue_hash_req(struct request_sock_queue *queue, write_unlock(&queue->syn_wait_lock); } -extern void reqsk_queue_prune(struct request_sock_queue *queue, struct sock *parent, - const unsigned long interval, const unsigned long timeout, - const unsigned long max_rto, int max_retries); - #endif /* _REQUEST_SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 2423f05..077db85 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -423,9 +423,6 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, size_t len, int nonblock, int flags, int *addr_len); -extern int inet_csk_listen_start(struct sock *sk, - const int nr_table_entries); - extern void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab); @@ -861,9 +858,6 @@ static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) tp->snd_wl1 = seq; } -extern void inet_csk_destroy_sock(struct sock *sk); - - /* * Calculate(/check) TCP checksum */ diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 8c396ee..9f1f1ab 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -220,11 +220,7 @@ out: */ static void dccp_response_timer(struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); - const int max_retries = icsk->icsk_syn_retries ? : TCP_SYNACK_RETRIES /* FIXME sysctl_tcp_synack_retries */; - - reqsk_queue_prune(&icsk->icsk_accept_queue, sk, TCP_SYNQ_INTERVAL, - DCCP_TIMEOUT_INIT, DCCP_RTO_MAX, max_retries); + inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); } static void dccp_keepalive_timer(unsigned long data) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 136ada0..026630a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -398,8 +399,100 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, inet_csk_reqsk_queue_added(sk, timeout); } +/* Only thing we need from tcp.h */ +extern int sysctl_tcp_synack_retries; + EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); +void inet_csk_reqsk_queue_prune(struct sock *parent, + const unsigned long interval, + const unsigned long timeout, + const unsigned long max_rto) +{ + struct inet_connection_sock *icsk = inet_csk(parent); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct listen_sock *lopt = queue->listen_opt; + int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; + unsigned long now = jiffies; + struct request_sock **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. + */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; + } + } + + if (queue->rskq_defer_accept) + max_retries = queue->rskq_defer_accept; + + budget = 2 * (lopt->nr_table_entries / (timeout / interval)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if (time_after_eq(now, req->expires)) { + if ((req->retrans < thresh || + (inet_rsk(req)->acked && req->retrans < max_retries)) + && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((timeout << req->retrans), max_rto); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + inet_csk_reqsk_queue_unlink(parent, req, reqp); + reqsk_queue_removed(queue, req); + reqsk_free(req); + continue; + } + reqp = &req->dl_next; + } + + i = (i + 1) & (lopt->nr_table_entries - 1); + + } while (--budget > 0); + + lopt->clock_hand = i; + + if (lopt->qlen) + inet_csk_reset_keepalive_timer(parent, interval); +} + +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); + struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, const unsigned int __nocast priority) { @@ -424,3 +517,124 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, } EXPORT_SYMBOL_GPL(inet_csk_clone); + +/* + * At this point, there should be no process reference to this + * socket, and thus no user references at all. Therefore we + * can assume the socket waitqueue is inactive and nobody will + * try to jump onto it. + */ +void inet_csk_destroy_sock(struct sock *sk) +{ + BUG_TRAP(sk->sk_state == TCP_CLOSE); + BUG_TRAP(sock_flag(sk, SOCK_DEAD)); + + /* It cannot be in hash table! */ + BUG_TRAP(sk_unhashed(sk)); + + /* If it has not 0 inet_sk(sk)->num, it must be bound */ + BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); + + sk->sk_prot->destroy(sk); + + sk_stream_kill_queues(sk); + + xfrm_sk_free_policy(sk); + + sk_refcnt_debug_release(sk); + + atomic_dec(sk->sk_prot->orphan_count); + sock_put(sk); +} + +EXPORT_SYMBOL(inet_csk_destroy_sock); + +int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + + if (rc != 0) + return rc; + + sk->sk_max_ack_backlog = 0; + sk->sk_ack_backlog = 0; + inet_csk_delack_init(sk); + + /* There is race window here: we announce ourselves listening, + * but this transition is still not validated by get_port(). + * It is OK, because this socket enters to hash table only + * after validation is complete. + */ + sk->sk_state = TCP_LISTEN; + if (!sk->sk_prot->get_port(sk, inet->num)) { + inet->sport = htons(inet->num); + + sk_dst_reset(sk); + sk->sk_prot->hash(sk); + + return 0; + } + + sk->sk_state = TCP_CLOSE; + __reqsk_queue_destroy(&icsk->icsk_accept_queue); + return -EADDRINUSE; +} + +EXPORT_SYMBOL_GPL(inet_csk_listen_start); + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. + */ +void inet_csk_listen_stop(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *acc_req; + struct request_sock *req; + + inet_csk_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. --ANK + */ + reqsk_queue_destroy(&icsk->icsk_accept_queue); + + while ((req = acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + BUG_TRAP(!sock_owned_by_user(child)); + sock_hold(child); + + sk->sk_prot->disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + atomic_inc(sk->sk_prot->orphan_count); + + inet_csk_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + sk_acceptq_removed(sk); + __reqsk_free(req); + } + BUG_TRAP(!sk->sk_ack_backlog); +} + +EXPORT_SYMBOL_GPL(inet_csk_listen_stop); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a4e9eec..4bda522 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -456,96 +456,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int __user *)arg); } -int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) -{ - struct inet_sock *inet = inet_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); - - if (rc != 0) - return rc; - - sk->sk_max_ack_backlog = 0; - sk->sk_ack_backlog = 0; - inet_csk_delack_init(sk); - - /* There is race window here: we announce ourselves listening, - * but this transition is still not validated by get_port(). - * It is OK, because this socket enters to hash table only - * after validation is complete. - */ - sk->sk_state = TCP_LISTEN; - if (!sk->sk_prot->get_port(sk, inet->num)) { - inet->sport = htons(inet->num); - - sk_dst_reset(sk); - sk->sk_prot->hash(sk); - - return 0; - } - - sk->sk_state = TCP_CLOSE; - __reqsk_queue_destroy(&icsk->icsk_accept_queue); - return -EADDRINUSE; -} - -EXPORT_SYMBOL_GPL(inet_csk_listen_start); - -/* - * This routine closes sockets which have been at least partially - * opened, but not yet accepted. - */ -void inet_csk_listen_stop(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct request_sock *acc_req; - struct request_sock *req; - - inet_csk_delete_keepalive_timer(sk); - - /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); - - /* Following specs, it would be better either to send FIN - * (and enter FIN-WAIT-1, it is normal close) - * or to send active reset (abort). - * Certainly, it is pretty dangerous while synflood, but it is - * bad justification for our negligence 8) - * To be honest, we are not able to make either - * of the variants now. --ANK - */ - reqsk_queue_destroy(&icsk->icsk_accept_queue); - - while ((req = acc_req) != NULL) { - struct sock *child = req->sk; - - acc_req = req->dl_next; - - local_bh_disable(); - bh_lock_sock(child); - BUG_TRAP(!sock_owned_by_user(child)); - sock_hold(child); - - sk->sk_prot->disconnect(child, O_NONBLOCK); - - sock_orphan(child); - - atomic_inc(sk->sk_prot->orphan_count); - - inet_csk_destroy_sock(child); - - bh_unlock_sock(child); - local_bh_enable(); - sock_put(child); - - sk_acceptq_removed(sk); - __reqsk_free(req); - } - BUG_TRAP(!sk->sk_ack_backlog); -} - -EXPORT_SYMBOL_GPL(inet_csk_listen_stop); - static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; @@ -1559,35 +1469,6 @@ void tcp_shutdown(struct sock *sk, int how) } } -/* - * At this point, there should be no process reference to this - * socket, and thus no user references at all. Therefore we - * can assume the socket waitqueue is inactive and nobody will - * try to jump onto it. - */ -void inet_csk_destroy_sock(struct sock *sk) -{ - BUG_TRAP(sk->sk_state == TCP_CLOSE); - BUG_TRAP(sock_flag(sk, SOCK_DEAD)); - - /* It cannot be in hash table! */ - BUG_TRAP(sk_unhashed(sk)); - - /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); - - sk->sk_prot->destroy(sk); - - sk_stream_kill_queues(sk); - - xfrm_sk_free_policy(sk); - - sk_refcnt_debug_release(sk); - - atomic_dec(sk->sk_prot->orphan_count); - sock_put(sk); -} - void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; @@ -2258,7 +2139,6 @@ void __init tcp_init(void) } EXPORT_SYMBOL(tcp_close); -EXPORT_SYMBOL(inet_csk_destroy_sock); EXPORT_SYMBOL(tcp_disconnect); EXPORT_SYMBOL(tcp_getsockopt); EXPORT_SYMBOL(tcp_ioctl); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b614ad4..72cec69 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -424,103 +424,14 @@ out_unlock: sock_put(sk); } -void reqsk_queue_prune(struct request_sock_queue *queue, struct sock *parent, - const unsigned long interval, const unsigned long timeout, - const unsigned long max_rto, int max_retries) -{ - struct inet_connection_sock *icsk = inet_csk(parent); - struct listen_sock *lopt = queue->listen_opt; - int thresh = max_retries; - unsigned long now = jiffies; - struct request_sock **reqp, *req; - int i, budget; - - if (lopt == NULL || lopt->qlen == 0) - return; - - /* Normally all the openreqs are young and become mature - * (i.e. converted to established socket) for first timeout. - * If synack was not acknowledged for 3 seconds, it means - * one of the following things: synack was lost, ack was lost, - * rtt is high or nobody planned to ack (i.e. synflood). - * When server is a bit loaded, queue is populated with old - * open requests, reducing effective size of queue. - * When server is well loaded, queue size reduces to zero - * after several minutes of work. It is not synflood, - * it is normal operation. The solution is pruning - * too old entries overriding normal timeout, when - * situation becomes dangerous. - * - * Essentially, we reserve half of room for young - * embrions; and abort old ones without pity, if old - * ones are about to clog our table. - */ - if (lopt->qlen>>(lopt->max_qlen_log-1)) { - int young = (lopt->qlen_young<<1); - - while (thresh > 2) { - if (lopt->qlen < young) - break; - thresh--; - young <<= 1; - } - } - - if (queue->rskq_defer_accept) - max_retries = queue->rskq_defer_accept; - - budget = 2 * (lopt->nr_table_entries / (timeout / interval)); - i = lopt->clock_hand; - - do { - reqp=&lopt->syn_table[i]; - while ((req = *reqp) != NULL) { - if (time_after_eq(now, req->expires)) { - if ((req->retrans < thresh || - (inet_rsk(req)->acked && req->retrans < max_retries)) - && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { - unsigned long timeo; - - if (req->retrans++ == 0) - lopt->qlen_young--; - timeo = min((timeout << req->retrans), max_rto); - req->expires = now + timeo; - reqp = &req->dl_next; - continue; - } - - /* Drop this request */ - inet_csk_reqsk_queue_unlink(parent, req, reqp); - reqsk_queue_removed(&icsk->icsk_accept_queue, req); - reqsk_free(req); - continue; - } - reqp = &req->dl_next; - } - - i = (i + 1) & (lopt->nr_table_entries - 1); - - } while (--budget > 0); - - lopt->clock_hand = i; - - if (lopt->qlen) - inet_csk_reset_keepalive_timer(parent, interval); -} - -EXPORT_SYMBOL_GPL(reqsk_queue_prune); - /* * Timer for listening sockets */ static void tcp_synack_timer(struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); - const int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; - - reqsk_queue_prune(&icsk->icsk_accept_queue, sk, TCP_SYNQ_INTERVAL, - TCP_TIMEOUT_INIT, TCP_RTO_MAX, max_retries); + inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, + TCP_TIMEOUT_INIT, TCP_RTO_MAX); } void tcp_set_keepalive(struct sock *sk, int val) -- cgit v0.10.2 From 95b81ef794278c835b321f6376b0522cd5df59b7 Mon Sep 17 00:00:00 2001 From: Yoshifumi Nishida Date: Tue, 9 Aug 2005 20:15:35 -0700 Subject: [DCCP]: Fix checksum routines Signed-off-by: Yoshifumi Nishida Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index fb83454..55b690a 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -242,7 +242,8 @@ extern int dccp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen); extern void dccp_shutdown(struct sock *sk, int how); -extern int dccp_v4_checksum(struct sk_buff *skb); +extern int dccp_v4_checksum(const struct sk_buff *skb, + const u32 saddr, const u32 daddr); extern int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 083baca..7b90606 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -802,9 +802,9 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) return sk; } -int dccp_v4_checksum(struct sk_buff *skb) +int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, const u32 daddr) { - struct dccp_hdr* dh = dccp_hdr(skb); + const struct dccp_hdr* dh = dccp_hdr(skb); int checksum_len; u32 tmp; @@ -816,24 +816,24 @@ int dccp_v4_checksum(struct sk_buff *skb) } tmp = csum_partial((unsigned char *)dh, checksum_len, 0); - return csum_fold(tmp); + return csum_tcpudp_magic(saddr, daddr, checksum_len, IPPROTO_DCCP, tmp); } -static int dccp_v4_verify_checksum(struct sk_buff *skb) +static int dccp_v4_verify_checksum(struct sk_buff *skb, + const u32 saddr, const u32 daddr) { - struct dccp_hdr *th = dccp_hdr(skb); - const u16 remote_checksum = th->dccph_checksum; - u16 local_checksum; - - /* FIXME: don't mess with skb payload */ - th->dccph_checksum = 0; /* zero it for computation */ - - local_checksum = dccp_v4_checksum(skb); - - /* FIXME: don't mess with skb payload */ - th->dccph_checksum = remote_checksum; /* put it back */ + struct dccp_hdr *dh = dccp_hdr(skb); + int checksum_len; + u32 tmp; - return remote_checksum == local_checksum ? 0 : -1; + if (dh->dccph_cscov == 0) + checksum_len = skb->len; + else { + checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32); + checksum_len = checksum_len < skb->len ? checksum_len : skb->len; + } + tmp = csum_partial((unsigned char *)dh, checksum_len, 0); + return csum_tcpudp_magic(saddr, daddr, checksum_len, IPPROTO_DCCP, tmp) == 0 ? 0 : -1; } static struct dst_entry* dccp_v4_route_skb(struct sock *sk, @@ -902,7 +902,8 @@ void dccp_v4_ctl_send_reset(struct sk_buff *rxskb) dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq); - dh->dccph_checksum = dccp_v4_checksum(skb); + dh->dccph_checksum = dccp_v4_checksum(skb, rxskb->nh.iph->saddr, + rxskb->nh.iph->daddr); bh_lock_sock(dccp_ctl_socket->sk); err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk, @@ -1024,7 +1025,8 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) } /* If the header checksum is incorrect, drop packet and return */ - if (dccp_v4_verify_checksum(skb) < 0) { + if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, + skb->nh.iph->daddr) < 0) { dccp_pr_debug("header checksum is incorrect\n"); return 1; } diff --git a/net/dccp/output.c b/net/dccp/output.c index 22ca291..4945eaa 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -93,7 +93,8 @@ int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) break; } - dh->dccph_checksum = dccp_v4_checksum(skb); + dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, + inet->daddr); if (dcb->dccpd_type == DCCP_PKT_ACK || dcb->dccpd_type == DCCP_PKT_DATAACK) @@ -193,7 +194,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, dccp_hdr_set_seq(dh, dccp_rsk(req)->dreq_iss); dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dccp_rsk(req)->dreq_isr); - dh->dccph_checksum = dccp_v4_checksum(skb); + dh->dccph_checksum = dccp_v4_checksum(skb, inet_rsk(req)->loc_addr, + inet_rsk(req)->rmt_addr); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); return skb; @@ -242,7 +244,8 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, dccp_hdr_reset(skb)->dccph_reset_code = code; - dh->dccph_checksum = dccp_v4_checksum(skb); + dh->dccph_checksum = dccp_v4_checksum(skb, inet_sk(sk)->saddr, + inet_sk(sk)->daddr); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); return skb; -- cgit v0.10.2 From 74459dc7bacda04d14626d239c8f5c4dac22560d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:15:51 -0700 Subject: [LIST]: Introduce list_for_each_entry_safe_continue Used in the dccp CCID3 code, that is going to be submitted RSN. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/list.h b/include/linux/list.h index aab2db2..597094e 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -419,6 +419,19 @@ static inline void list_splice_init(struct list_head *list, pos = n, n = list_entry(n->member.next, typeof(*n), member)) /** + * list_for_each_entry_safe_continue - iterate over list of given type + * continuing after existing point safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = n, n = list_entry(n->member.next, typeof(*n), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** * list_for_each_rcu - iterate over an rcu-protected list * @pos: the &struct list_head to use as a loop counter. * @head: the head for your list. -- cgit v0.10.2 From 757f612e091e7d13707eedc3ff71f1a9b53f5537 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:16:04 -0700 Subject: [CCID3]: Reenable list_for_each_entry_safe_continue usage Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 4f45902..04299c7 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -1272,13 +1272,10 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ccid3_calc_new_delta(hctx); /* remove all packets older than the one acked from history */ -#if 0 - FIXME! list_for_each_entry_safe_continue(entry, next, &hctx->ccid3hctx_hist, ccid3htx_node) { list_del_init(&entry->ccid3htx_node); ccid3_tx_hist_entry_delete(entry); } -#endif if (hctx->ccid3hctx_x < 10) { ccid3_pr_debug("ccid3_hc_tx_packet_recv hctx->ccid3hctx_x < 10\n"); hctx->ccid3hctx_x = 10; @@ -1820,8 +1817,7 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) a_next = b_next; num_later = 1; -#if 0 - FIXME MERGE GIT! + list_for_each_entry_safe_continue(entry, a_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { if (num_later == 0) { a_loss = entry; @@ -1830,7 +1826,6 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) entry->ccid3hrx_type == DCCP_PKT_DATAACK) --num_later; } -#endif if (a_loss == NULL) { if (list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { @@ -1848,8 +1843,6 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) /* Locate a lost data packet */ entry = packet = b_loss; -#if 0 - FIXME MERGE GIT! list_for_each_entry_safe_continue(entry, b_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { u64 delta = dccp_delta_seqno(entry->ccid3hrx_seqno, packet->ccid3hrx_seqno); @@ -1875,7 +1868,6 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) if (packet == a_loss) break; } -#endif if (seq_loss != DCCP_MAX_SEQNO + 1) win_loss = a_loss->ccid3hrx_win_count; -- cgit v0.10.2 From 2009493065e01b1fe27c1b98ffbcfab98e185f72 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 9 Aug 2005 20:16:32 -0700 Subject: [TG3]: Add basic register access function pointers This patch adds the basic function pointers to do register accesses in the fast path. This was suggested by David Miller. The idea is that various register access methods for different hardware errata can easily be implemented with these function pointers and performance will not be degraded on chips that use normal register access methods. The various register read write macros (e.g. tw32, tr32, tw32_mailbox) are redefined to call the function pointers. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 6d4ab1e..13283c2 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -366,7 +366,7 @@ static void _tw32_flush(struct tg3 *tp, u32 off, u32 val) } } -static inline void _tw32_rx_mbox(struct tg3 *tp, u32 off, u32 val) +static void tg3_write32_rx_mbox(struct tg3 *tp, u32 off, u32 val) { void __iomem *mbox = tp->regs + off; writel(val, mbox); @@ -374,7 +374,7 @@ static inline void _tw32_rx_mbox(struct tg3 *tp, u32 off, u32 val) readl(mbox); } -static inline void _tw32_tx_mbox(struct tg3 *tp, u32 off, u32 val) +static void tg3_write32_tx_mbox(struct tg3 *tp, u32 off, u32 val) { void __iomem *mbox = tp->regs + off; writel(val, mbox); @@ -384,17 +384,23 @@ static inline void _tw32_tx_mbox(struct tg3 *tp, u32 off, u32 val) readl(mbox); } -#define tw32_mailbox(reg, val) writel(((val) & 0xffffffff), tp->regs + (reg)) -#define tw32_rx_mbox(reg, val) _tw32_rx_mbox(tp, reg, val) -#define tw32_tx_mbox(reg, val) _tw32_tx_mbox(tp, reg, val) +static void tg3_write32(struct tg3 *tp, u32 off, u32 val) +{ + writel(val, tp->regs + off); +} -#define tw32(reg,val) tg3_write_indirect_reg32(tp,(reg),(val)) +static u32 tg3_read32(struct tg3 *tp, u32 off) +{ + return (readl(tp->regs + off)); +} + +#define tw32_mailbox(reg, val) tp->write32_mbox(tp, reg, val) +#define tw32_rx_mbox(reg, val) tp->write32_rx_mbox(tp, reg, val) +#define tw32_tx_mbox(reg, val) tp->write32_tx_mbox(tp, reg, val) + +#define tw32(reg,val) tp->write32(tp, reg, val) #define tw32_f(reg,val) _tw32_flush(tp,(reg),(val)) -#define tw16(reg,val) writew(((val) & 0xffff), tp->regs + (reg)) -#define tw8(reg,val) writeb(((val) & 0xff), tp->regs + (reg)) -#define tr32(reg) readl(tp->regs + (reg)) -#define tr16(reg) readw(tp->regs + (reg)) -#define tr8(reg) readb(tp->regs + (reg)) +#define tr32(reg) tp->read32(tp, reg) static void tg3_write_mem(struct tg3 *tp, u32 off, u32 val) { @@ -9325,6 +9331,12 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) pci_write_config_dword(tp->pdev, TG3PCI_PCISTATE, pci_state_reg); } + tp->read32 = tg3_read32; + tp->write32 = tg3_write_indirect_reg32; + tp->write32_mbox = tg3_write32; + tp->write32_tx_mbox = tg3_write32_tx_mbox; + tp->write32_rx_mbox = tg3_write32_rx_mbox; + /* Get eeprom hw config before calling tg3_set_power_state(). * In particular, the TG3_FLAG_EEPROM_WRITE_PROT flag must be * determined before calling tg3_set_power_state() so that diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 5c4433c..394acdd 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2049,6 +2049,10 @@ struct tg3 { spinlock_t lock; spinlock_t indirect_lock; + u32 (*read32) (struct tg3 *, u32); + void (*write32) (struct tg3 *, u32, u32); + void (*write32_mbox) (struct tg3 *, u32, + u32); void __iomem *regs; struct net_device *dev; struct pci_dev *pdev; @@ -2060,6 +2064,8 @@ struct tg3 { u32 msg_enable; /* begin "tx thread" cacheline section */ + void (*write32_tx_mbox) (struct tg3 *, u32, + u32); u32 tx_prod; u32 tx_cons; u32 tx_pending; @@ -2071,6 +2077,8 @@ struct tg3 { dma_addr_t tx_desc_mapping; /* begin "rx thread" cacheline section */ + void (*write32_rx_mbox) (struct tg3 *, u32, + u32); u32 rx_rcb_ptr; u32 rx_std_ptr; u32 rx_jumbo_ptr; -- cgit v0.10.2 From 1ee582d8e49a1c9dd43b2599f1cd26507182a8d4 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 9 Aug 2005 20:16:46 -0700 Subject: [TG3]: Add various register methods This patch adds various dedicated register read/write methods for the existing workarounds, including PCIX target workaround, write with read flush, etc. The chips that require these workarounds will use these dedicated access functions. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 13283c2..80fbb18 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -340,16 +340,16 @@ static struct { static void tg3_write_indirect_reg32(struct tg3 *tp, u32 off, u32 val) { - if ((tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) != 0) { - spin_lock_bh(&tp->indirect_lock); - pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); - pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); - spin_unlock_bh(&tp->indirect_lock); - } else { - writel(val, tp->regs + off); - if ((tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) != 0) - readl(tp->regs + off); - } + spin_lock_bh(&tp->indirect_lock); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); + pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); + spin_unlock_bh(&tp->indirect_lock); +} + +static void tg3_write_flush_reg32(struct tg3 *tp, u32 off, u32 val) +{ + writel(val, tp->regs + off); + readl(tp->regs + off); } static void _tw32_flush(struct tg3 *tp, u32 off, u32 val) @@ -366,14 +366,6 @@ static void _tw32_flush(struct tg3 *tp, u32 off, u32 val) } } -static void tg3_write32_rx_mbox(struct tg3 *tp, u32 off, u32 val) -{ - void __iomem *mbox = tp->regs + off; - writel(val, mbox); - if (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) - readl(mbox); -} - static void tg3_write32_tx_mbox(struct tg3 *tp, u32 off, u32 val) { void __iomem *mbox = tp->regs + off; @@ -4222,7 +4214,7 @@ static void tg3_stop_fw(struct tg3 *); static int tg3_chip_reset(struct tg3 *tp) { u32 val; - u32 flags_save; + void (*write_op)(struct tg3 *, u32, u32); int i; if (!(tp->tg3_flags2 & TG3_FLG2_SUN_570X)) @@ -4234,8 +4226,9 @@ static int tg3_chip_reset(struct tg3 *tp) * fun things. So, temporarily disable the 5701 * hardware workaround, while we do the reset. */ - flags_save = tp->tg3_flags; - tp->tg3_flags &= ~TG3_FLAG_5701_REG_WRITE_BUG; + write_op = tp->write32; + if (write_op == tg3_write_flush_reg32) + tp->write32 = tg3_write32; /* do the reset */ val = GRC_MISC_CFG_CORECLK_RESET; @@ -4254,8 +4247,8 @@ static int tg3_chip_reset(struct tg3 *tp) val |= GRC_MISC_CFG_KEEP_GPHY_POWER; tw32(GRC_MISC_CFG, val); - /* restore 5701 hardware bug workaround flag */ - tp->tg3_flags = flags_save; + /* restore 5701 hardware bug workaround write method */ + tp->write32 = write_op; /* Unfortunately, we have to delay before the PCI read back. * Some 575X chips even will not respond to a PCI cfg access @@ -4641,7 +4634,6 @@ static int tg3_load_firmware_cpu(struct tg3 *tp, u32 cpu_base, u32 cpu_scratch_b int cpu_scratch_size, struct fw_info *info) { int err, i; - u32 orig_tg3_flags = tp->tg3_flags; void (*write_op)(struct tg3 *, u32, u32); if (cpu_base == TX_CPU_BASE && @@ -4657,11 +4649,6 @@ static int tg3_load_firmware_cpu(struct tg3 *tp, u32 cpu_base, u32 cpu_scratch_b else write_op = tg3_write_indirect_reg32; - /* Force use of PCI config space for indirect register - * write calls. - */ - tp->tg3_flags |= TG3_FLAG_PCIX_TARGET_HWBUG; - /* It is possible that bootcode is still loading at this point. * Get the nvram lock first before halting the cpu. */ @@ -4697,7 +4684,6 @@ static int tg3_load_firmware_cpu(struct tg3 *tp, u32 cpu_base, u32 cpu_scratch_b err = 0; out: - tp->tg3_flags = orig_tg3_flags; return err; } @@ -9331,11 +9317,25 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) pci_write_config_dword(tp->pdev, TG3PCI_PCISTATE, pci_state_reg); } + /* Default fast path register access methods */ tp->read32 = tg3_read32; - tp->write32 = tg3_write_indirect_reg32; + tp->write32 = tg3_write32; tp->write32_mbox = tg3_write32; - tp->write32_tx_mbox = tg3_write32_tx_mbox; - tp->write32_rx_mbox = tg3_write32_rx_mbox; + tp->write32_tx_mbox = tg3_write32; + tp->write32_rx_mbox = tg3_write32; + + /* Various workaround register access methods */ + if (tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) + tp->write32 = tg3_write_indirect_reg32; + else if (tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) + tp->write32 = tg3_write_flush_reg32; + + if ((tp->tg3_flags & TG3_FLAG_TXD_MBOX_HWBUG) || + (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER)) { + tp->write32_tx_mbox = tg3_write32_tx_mbox; + if (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) + tp->write32_rx_mbox = tg3_write_flush_reg32; + } /* Get eeprom hw config before calling tg3_set_power_state(). * In particular, the TG3_FLAG_EEPROM_WRITE_PROT flag must be -- cgit v0.10.2 From 09ee929cccfd0b56ea3724b3c6299fbbe813df43 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 9 Aug 2005 20:17:00 -0700 Subject: [TG3]: Add mailbox read method This patch adds the mailbox read method and also adds an inline function tw32_mailbox_f() for mailbox writes that require read flush. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 80fbb18..8411e0f 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -366,6 +366,12 @@ static void _tw32_flush(struct tg3 *tp, u32 off, u32 val) } } +static inline void tw32_mailbox_flush(struct tg3 *tp, u32 off, u32 val) +{ + tp->write32_mbox(tp, off, val); + tp->read32_mbox(tp, off); +} + static void tg3_write32_tx_mbox(struct tg3 *tp, u32 off, u32 val) { void __iomem *mbox = tp->regs + off; @@ -387,8 +393,10 @@ static u32 tg3_read32(struct tg3 *tp, u32 off) } #define tw32_mailbox(reg, val) tp->write32_mbox(tp, reg, val) +#define tw32_mailbox_f(reg, val) tw32_mailbox_flush(tp, (reg), (val)) #define tw32_rx_mbox(reg, val) tp->write32_rx_mbox(tp, reg, val) #define tw32_tx_mbox(reg, val) tp->write32_tx_mbox(tp, reg, val) +#define tr32_mailbox(reg) tp->read32_mbox(tp, reg) #define tw32(reg,val) tp->write32(tp, reg, val) #define tw32_f(reg,val) _tw32_flush(tp,(reg),(val)) @@ -420,8 +428,7 @@ static void tg3_disable_ints(struct tg3 *tp) { tw32(TG3PCI_MISC_HOST_CTRL, (tp->misc_host_ctrl | MISC_HOST_CTRL_MASK_PCI_INT)); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001); } static inline void tg3_cond_int(struct tg3 *tp) @@ -437,9 +444,8 @@ static void tg3_enable_ints(struct tg3 *tp) tw32(TG3PCI_MISC_HOST_CTRL, (tp->misc_host_ctrl & ~MISC_HOST_CTRL_MASK_PCI_INT)); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, - (tp->last_tag << 24)); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + (tp->last_tag << 24)); tg3_cond_int(tp); } @@ -3276,9 +3282,8 @@ static irqreturn_t tg3_interrupt(int irq, void *dev_id, struct pt_regs *regs) /* No work, shared interrupt perhaps? re-enable * interrupts, and flush that PCI write */ - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000000); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); } } else { /* shared interrupt */ handled = 0; @@ -3321,9 +3326,8 @@ static irqreturn_t tg3_interrupt_tagged(int irq, void *dev_id, struct pt_regs *r /* no work, shared interrupt perhaps? re-enable * interrupts, and flush that PCI write */ - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, - tp->last_tag << 24); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + tp->last_tag << 24); } } else { /* shared interrupt */ handled = 0; @@ -5800,8 +5804,7 @@ static int tg3_reset_hw(struct tg3 *tp) tw32_f(GRC_LOCAL_CTRL, tp->grc_local_ctrl); udelay(100); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0); tp->last_tag = 0; if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) { @@ -6190,7 +6193,8 @@ static int tg3_test_interrupt(struct tg3 *tp) HOSTCC_MODE_NOW); for (i = 0; i < 5; i++) { - int_mbox = tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + int_mbox = tr32_mailbox(MAILBOX_INTERRUPT_0 + + TG3_64BIT_REG_LOW); if (int_mbox != 0) break; msleep(10); @@ -6590,10 +6594,10 @@ static int tg3_open(struct net_device *dev) /* Mailboxes */ printk("DEBUG: SNDHOST_PROD[%08x%08x] SNDNIC_PROD[%08x%08x]\n", - tr32(MAILBOX_SNDHOST_PROD_IDX_0 + 0x0), - tr32(MAILBOX_SNDHOST_PROD_IDX_0 + 0x4), - tr32(MAILBOX_SNDNIC_PROD_IDX_0 + 0x0), - tr32(MAILBOX_SNDNIC_PROD_IDX_0 + 0x4)); + tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + 0x0), + tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + 0x4), + tr32_mailbox(MAILBOX_SNDNIC_PROD_IDX_0 + 0x0), + tr32_mailbox(MAILBOX_SNDNIC_PROD_IDX_0 + 0x4)); /* NIC side send descriptors. */ for (i = 0; i < 6; i++) { @@ -7893,7 +7897,7 @@ static int tg3_test_loopback(struct tg3 *tp) num_pkts++; tw32_tx_mbox(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW, send_idx); - tr32(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW); + tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW); udelay(10); @@ -9320,6 +9324,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) /* Default fast path register access methods */ tp->read32 = tg3_read32; tp->write32 = tg3_write32; + tp->read32_mbox = tg3_read32; tp->write32_mbox = tg3_write32; tp->write32_tx_mbox = tg3_write32; tp->write32_rx_mbox = tg3_write32; diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 394acdd..c398b84 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2051,6 +2051,7 @@ struct tg3 { u32 (*read32) (struct tg3 *, u32); void (*write32) (struct tg3 *, u32, u32); + u32 (*read32_mbox) (struct tg3 *, u32); void (*write32_mbox) (struct tg3 *, u32, u32); void __iomem *regs; -- cgit v0.10.2 From 6892914fb7980d844f2bac859f4095df9ebd18da Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 9 Aug 2005 20:17:14 -0700 Subject: [TG3]: Add indirect register method for 5703 behind ICH This patch adds the new workaround for 5703 A1/A2 if it is behind certain ICH bridges. The workaround disables memory and uses config. cycles only to access all registers. The 5702/03 chips can mistakenly decode the special cycles from the ICH chipsets as memory write cycles, causing corruption of register and memory space. Only certain ICH bridges will drive special cycles with non-zero data during the address phase which can fall within the 5703's address range. This is not an ICH bug as the PCI spec allows non-zero address during special cycles. However, only these ICH bridges are known to drive non-zero addresses during special cycles. The indirect_lock is also changed to spin_lock_irqsave from spin_lock_bh because it is used in irq handler when using the indirect method to disable interrupts. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 8411e0f..3a7cfb8 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -340,10 +340,12 @@ static struct { static void tg3_write_indirect_reg32(struct tg3 *tp, u32 off, u32 val) { - spin_lock_bh(&tp->indirect_lock); + unsigned long flags; + + spin_lock_irqsave(&tp->indirect_lock, flags); pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); - spin_unlock_bh(&tp->indirect_lock); + spin_unlock_irqrestore(&tp->indirect_lock, flags); } static void tg3_write_flush_reg32(struct tg3 *tp, u32 off, u32 val) @@ -352,24 +354,75 @@ static void tg3_write_flush_reg32(struct tg3 *tp, u32 off, u32 val) readl(tp->regs + off); } -static void _tw32_flush(struct tg3 *tp, u32 off, u32 val) +static u32 tg3_read_indirect_reg32(struct tg3 *tp, u32 off) { - if ((tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) != 0) { - spin_lock_bh(&tp->indirect_lock); - pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); - pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); - spin_unlock_bh(&tp->indirect_lock); - } else { - void __iomem *dest = tp->regs + off; - writel(val, dest); - readl(dest); /* always flush PCI write */ + unsigned long flags; + u32 val; + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); + pci_read_config_dword(tp->pdev, TG3PCI_REG_DATA, &val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); + return val; +} + +static void tg3_write_indirect_mbox(struct tg3 *tp, u32 off, u32 val) +{ + unsigned long flags; + + if (off == (MAILBOX_RCVRET_CON_IDX_0 + TG3_64BIT_REG_LOW)) { + pci_write_config_dword(tp->pdev, TG3PCI_RCV_RET_RING_CON_IDX + + TG3_64BIT_REG_LOW, val); + return; + } + if (off == (MAILBOX_RCV_STD_PROD_IDX + TG3_64BIT_REG_LOW)) { + pci_write_config_dword(tp->pdev, TG3PCI_STD_RING_PROD_IDX + + TG3_64BIT_REG_LOW, val); + return; } + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off + 0x5600); + pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); + + /* In indirect mode when disabling interrupts, we also need + * to clear the interrupt bit in the GRC local ctrl register. + */ + if ((off == (MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW)) && + (val == 0x1)) { + pci_write_config_dword(tp->pdev, TG3PCI_MISC_LOCAL_CTRL, + tp->grc_local_ctrl|GRC_LCLCTRL_CLEARINT); + } +} + +static u32 tg3_read_indirect_mbox(struct tg3 *tp, u32 off) +{ + unsigned long flags; + u32 val; + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off + 0x5600); + pci_read_config_dword(tp->pdev, TG3PCI_REG_DATA, &val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); + return val; +} + +static void _tw32_flush(struct tg3 *tp, u32 off, u32 val) +{ + tp->write32(tp, off, val); + if (!(tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) && + !(tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) && + !(tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND)) + tp->read32(tp, off); /* flush */ } static inline void tw32_mailbox_flush(struct tg3 *tp, u32 off, u32 val) { tp->write32_mbox(tp, off, val); - tp->read32_mbox(tp, off); + if (!(tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) && + !(tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND)) + tp->read32_mbox(tp, off); } static void tg3_write32_tx_mbox(struct tg3 *tp, u32 off, u32 val) @@ -404,24 +457,28 @@ static u32 tg3_read32(struct tg3 *tp, u32 off) static void tg3_write_mem(struct tg3 *tp, u32 off, u32 val) { - spin_lock_bh(&tp->indirect_lock); + unsigned long flags; + + spin_lock_irqsave(&tp->indirect_lock, flags); pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, off); pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_DATA, val); /* Always leave this as zero. */ pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, 0); - spin_unlock_bh(&tp->indirect_lock); + spin_unlock_irqrestore(&tp->indirect_lock, flags); } static void tg3_read_mem(struct tg3 *tp, u32 off, u32 *val) { - spin_lock_bh(&tp->indirect_lock); + unsigned long flags; + + spin_lock_irqsave(&tp->indirect_lock, flags); pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, off); pci_read_config_dword(tp->pdev, TG3PCI_MEM_WIN_DATA, val); /* Always leave this as zero. */ pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, 0); - spin_unlock_bh(&tp->indirect_lock); + spin_unlock_irqrestore(&tp->indirect_lock, flags); } static void tg3_disable_ints(struct tg3 *tp) @@ -9149,14 +9206,6 @@ static int __devinit tg3_is_sun_570X(struct tg3 *tp) static int __devinit tg3_get_invariants(struct tg3 *tp) { static struct pci_device_id write_reorder_chipsets[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801AA_8) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801AB_8) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_11) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_6) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_FE_GATE_700C) }, { }, @@ -9173,7 +9222,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) tp->tg3_flags2 |= TG3_FLG2_SUN_570X; #endif - /* If we have an AMD 762 or Intel ICH/ICH0/ICH2 chipset, write + /* If we have an AMD 762 chipset, write * reordering to the mailbox registers done by the host * controller can cause major troubles. We read back from * every mailbox register write to force the writes to be @@ -9211,6 +9260,69 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) if (tp->pci_chip_rev_id == CHIPREV_ID_5752_A0_HW) tp->pci_chip_rev_id = CHIPREV_ID_5752_A0; + /* If we have 5702/03 A1 or A2 on certain ICH chipsets, + * we need to disable memory and use config. cycles + * only to access all registers. The 5702/03 chips + * can mistakenly decode the special cycles from the + * ICH chipsets as memory write cycles, causing corruption + * of register and memory space. Only certain ICH bridges + * will drive special cycles with non-zero data during the + * address phase which can fall within the 5703's address + * range. This is not an ICH bug as the PCI spec allows + * non-zero address during special cycles. However, only + * these ICH bridges are known to drive non-zero addresses + * during special cycles. + * + * Since special cycles do not cross PCI bridges, we only + * enable this workaround if the 5703 is on the secondary + * bus of these ICH bridges. + */ + if ((tp->pci_chip_rev_id == CHIPREV_ID_5703_A1) || + (tp->pci_chip_rev_id == CHIPREV_ID_5703_A2)) { + static struct tg3_dev_id { + u32 vendor; + u32 device; + u32 rev; + } ich_chipsets[] = { + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801AA_8, + PCI_ANY_ID }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801AB_8, + PCI_ANY_ID }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_11, + 0xa }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_6, + PCI_ANY_ID }, + { }, + }; + struct tg3_dev_id *pci_id = &ich_chipsets[0]; + struct pci_dev *bridge = NULL; + + while (pci_id->vendor != 0) { + bridge = pci_get_device(pci_id->vendor, pci_id->device, + bridge); + if (!bridge) { + pci_id++; + continue; + } + if (pci_id->rev != PCI_ANY_ID) { + u8 rev; + + pci_read_config_byte(bridge, PCI_REVISION_ID, + &rev); + if (rev > pci_id->rev) + continue; + } + if (bridge->subordinate && + (bridge->subordinate->number == + tp->pdev->bus->number)) { + + tp->tg3_flags2 |= TG3_FLG2_ICH_WORKAROUND; + pci_dev_put(bridge); + break; + } + } + } + /* Find msi capability. */ if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780) tp->msi_cap = pci_find_capability(tp->pdev, PCI_CAP_ID_MSI); @@ -9342,6 +9454,22 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) tp->write32_rx_mbox = tg3_write_flush_reg32; } + if (tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND) { + tp->read32 = tg3_read_indirect_reg32; + tp->write32 = tg3_write_indirect_reg32; + tp->read32_mbox = tg3_read_indirect_mbox; + tp->write32_mbox = tg3_write_indirect_mbox; + tp->write32_tx_mbox = tg3_write_indirect_mbox; + tp->write32_rx_mbox = tg3_write_indirect_mbox; + + iounmap(tp->regs); + tp->regs = 0; + + pci_read_config_word(tp->pdev, PCI_COMMAND, &pci_cmd); + pci_cmd &= ~PCI_COMMAND_MEMORY; + pci_write_config_word(tp->pdev, PCI_COMMAND, pci_cmd); + } + /* Get eeprom hw config before calling tg3_set_power_state(). * In particular, the TG3_FLAG_EEPROM_WRITE_PROT flag must be * determined before calling tg3_set_power_state() so that @@ -10486,7 +10614,10 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, return 0; err_out_iounmap: - iounmap(tp->regs); + if (tp->regs) { + iounmap(tp->regs); + tp->regs = 0; + } err_out_free_dev: free_netdev(dev); @@ -10508,7 +10639,10 @@ static void __devexit tg3_remove_one(struct pci_dev *pdev) struct tg3 *tp = netdev_priv(dev); unregister_netdev(dev); - iounmap(tp->regs); + if (tp->regs) { + iounmap(tp->regs); + tp->regs = 0; + } free_netdev(dev); pci_release_regions(pdev); pci_disable_device(pdev); diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index c398b84..c184b77 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2174,6 +2174,7 @@ struct tg3 { #define TG3_FLG2_ANY_SERDES (TG3_FLG2_PHY_SERDES | \ TG3_FLG2_MII_SERDES) #define TG3_FLG2_PARALLEL_DETECT 0x01000000 +#define TG3_FLG2_ICH_WORKAROUND 0x02000000 u32 split_mode_max_reqs; #define SPLIT_MODE_5704_MAX_REQ 3 -- cgit v0.10.2 From 15f5a585c6b8dac31ed0a55693aacf51934f0f5d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 9 Aug 2005 20:17:28 -0700 Subject: [TG3]: Eliminate one register write in tg3_restart_ints() The register write to register 0x68 to restart interrupts is unnecessary as the interrupt wasn't masked in that register by the irq handler. This will save one register write in the fast path. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 3a7cfb8..8bc28b1 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -533,8 +533,6 @@ static inline unsigned int tg3_has_work(struct tg3 *tp) */ static void tg3_restart_ints(struct tg3 *tp) { - tw32(TG3PCI_MISC_HOST_CTRL, - (tp->misc_host_ctrl & ~MISC_HOST_CTRL_MASK_PCI_INT)); tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, tp->last_tag << 24); mmiowb(); -- cgit v0.10.2 From 087fe256f0aef8d16b19a30c6fb10b899bf1a701 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 9 Aug 2005 20:17:41 -0700 Subject: [TG3]: Fix bug in setting a tg3_flag Found a bug while reviewing the patches the second time. The TG3_FLAG_TXD_MBOX_HWBUG flag is set after the register access methods have been determined. This patch fixes it by moving it up before the various access methods are assigned. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 8bc28b1..af8263a 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -9408,6 +9408,12 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) } } + /* 5700 BX chips need to have their TX producer index mailboxes + * written twice to workaround a bug. + */ + if (GET_CHIP_REV(tp->pci_chip_rev_id) == CHIPREV_5700_BX) + tp->tg3_flags |= TG3_FLAG_TXD_MBOX_HWBUG; + /* Back to back register writes can cause problems on this chip, * the workaround is to read back all reg writes except those to * mailbox regs. See tg3_write_indirect_reg32(). @@ -9682,14 +9688,6 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) else tp->tg3_flags &= ~TG3_FLAG_POLL_SERDES; - /* 5700 BX chips need to have their TX producer index mailboxes - * written twice to workaround a bug. - */ - if (GET_CHIP_REV(tp->pci_chip_rev_id) == CHIPREV_5700_BX) - tp->tg3_flags |= TG3_FLAG_TXD_MBOX_HWBUG; - else - tp->tg3_flags &= ~TG3_FLAG_TXD_MBOX_HWBUG; - /* It seems all chips can get confused if TX buffers * straddle the 4GB address boundary in some cases. */ -- cgit v0.10.2 From bb97d31f5130d677644d9931ef38613d1164ec94 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:19:14 -0700 Subject: [INET]: Make inet_create try to load protocol modules Syntax is net-pf-PROTOCOL_FAMILY-PROTOCOL-SOCK_TYPE and if this fails net-pf-PROTOCOL_FAMILY-PROTOCOL. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 70284e6..66c43fc 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -811,8 +811,13 @@ static void __exit dccp_fini(void) module_init(dccp_init); module_exit(dccp_fini); -/* __stringify doesn't likes enums, so use SOCK_DCCP (6) value directly */ -MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-6"); +/* + * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) + * values directly, Also cover the case where the protocol is not specified, + * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP + */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6"); +MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arnaldo Carvalho de Melo "); MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 52f5ecc..20f52b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -228,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol) struct proto *answer_prot; unsigned char answer_flags; char answer_no_check; - int err; + int try_loading_module = 0; + int err = -ESOCKTNOSUPPORT; sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ answer = NULL; +lookup_protocol: rcu_read_lock(); list_for_each_rcu(p, &inetsw[sock->type]) { answer = list_entry(p, struct inet_protosw, list); @@ -254,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol) answer = NULL; } - err = -ESOCKTNOSUPPORT; - if (!answer) - goto out_rcu_unlock; + if (unlikely(answer == NULL)) { + if (try_loading_module < 2) { + rcu_read_unlock(); + /* + * Be more specific, e.g. net-pf-2-proto-132-type-1 + * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) + */ + if (++try_loading_module == 1) + request_module("net-pf-%d-proto-%d-type-%d", + PF_INET, protocol, sock->type); + /* + * Fall back to generic, e.g. net-pf-2-proto-132 + * (net-pf-PF_INET-proto-IPPROTO_SCTP) + */ + else + request_module("net-pf-%d-proto-%d", + PF_INET, protocol); + goto lookup_protocol; + } else + goto out_rcu_unlock; + } + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 8d3f809..7d8ec65 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1242,6 +1242,10 @@ SCTP_STATIC __exit void sctp_exit(void) module_init(sctp_init); module_exit(sctp_exit); +/* + * __stringify doesn't likes enums, so use IPPROTO_SCTP value (132) directly. + */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); MODULE_AUTHOR("Linux Kernel SCTP developers "); MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); MODULE_LICENSE("GPL"); -- cgit v0.10.2 From 2669d63d20683828f673b606915957f3a070602d Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:19:44 -0700 Subject: [NETFILTER]: move conntrack helper buffers from BSS to kmalloc()ed memory According to DaveM, it is preferrable to have large data structures be allocated dynamically from the module init() function rather than putting them as static global variables into BSS. This patch moves the conntrack helper packet buffers into dynamically allocated memory. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index 01e1b58..be4c9eb 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); static char *conns[] = { "DATA ", "MESG ", "INDEX " }; /* This is slow, but it's simple. --RR */ -static char amanda_buffer[65536]; +static char *amanda_buffer; static DEFINE_SPINLOCK(amanda_buffer_lock); unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, @@ -153,11 +153,25 @@ static struct ip_conntrack_helper amanda_helper = { static void __exit fini(void) { ip_conntrack_helper_unregister(&amanda_helper); + kfree(amanda_buffer); } static int __init init(void) { - return ip_conntrack_helper_register(&amanda_helper); + int ret; + + amanda_buffer = kmalloc(65536, GFP_KERNEL); + if (!amanda_buffer) + return -ENOMEM; + + ret = ip_conntrack_helper_register(&amanda_helper); + if (ret < 0) { + kfree(amanda_buffer); + return ret; + } + return 0; + + } module_init(init); diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 9658896..3a2627d 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -25,8 +25,7 @@ MODULE_AUTHOR("Rusty Russell "); MODULE_DESCRIPTION("ftp connection tracking helper"); /* This is slow, but it's simple. --RR */ -static char ftp_buffer[65536]; - +static char *ftp_buffer; static DEFINE_SPINLOCK(ip_ftp_lock); #define MAX_PORTS 8 @@ -461,6 +460,8 @@ static void fini(void) ports[i]); ip_conntrack_helper_unregister(&ftp[i]); } + + kfree(ftp_buffer); } static int __init init(void) @@ -468,6 +469,10 @@ static int __init init(void) int i, ret; char *tmpname; + ftp_buffer = kmalloc(65536, GFP_KERNEL); + if (!ftp_buffer) + return -ENOMEM; + if (ports_c == 0) ports[ports_c++] = FTP_PORT; diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 4a28f29..25438ee 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -39,7 +39,7 @@ static int ports_c; static int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; /* This is slow, but it's simple. --RR */ -static char irc_buffer[65536]; +static char *irc_buffer; static DEFINE_SPINLOCK(irc_buffer_lock); unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, @@ -257,6 +257,10 @@ static int __init init(void) printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); return -EBUSY; } + + irc_buffer = kmalloc(65536, GFP_KERNEL); + if (!irc_buffer) + return -ENOMEM; /* If no port given, default to standard irc port */ if (ports_c == 0) @@ -304,6 +308,7 @@ static void fini(void) ports[i]); ip_conntrack_helper_unregister(&irc_helpers[i]); } + kfree(irc_buffer); } module_init(init); -- cgit v0.10.2 From 91483c4b711549bff5e9069e25c4c1400b135198 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 9 Aug 2005 20:20:07 -0700 Subject: [SUNRPC]: svcsock.c needs linux/tcp.h Signed-off-by: Andrew Morton Signed-off-by: David S. Miller diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e750cb6..199d374 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include -- cgit v0.10.2 From f682faefb8c6045468c4cf0fe435128352683c22 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:20:34 -0700 Subject: [NETFILTER]: fix autoloading of nfnetlink_log This patch adds the MODULE_ALIAS required for netnlink autoloading of nfnetlink_log. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 1750f0d..1158428 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -991,6 +991,7 @@ static void __exit fini(void) MODULE_DESCRIPTION("netfilter userspace logging"); MODULE_AUTHOR("Harald Welte "); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); module_init(init); module_exit(fini); -- cgit v0.10.2 From 210a9ebef2d1bd32d9e9d81c84d538e237769cdb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:20:54 -0700 Subject: [NETFILTER]: ip{6}_queue: prevent unregistration race with nfnetlink_queue Since nfnetlink_queue can override ip{6}_queue as queue handlers, we can no longer blindly unregister whoever is registered for PF_INET[6], but only unregister ourselves. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index cfc886f..629de64 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -692,7 +692,7 @@ init_or_cleanup(int init) return status; cleanup: - nf_unregister_queue_handler(PF_INET); + nf_unregister_queue_handlers(&ipq_enqueue_packet); synchronize_net(); ipq_flush(NF_DROP); diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 5af4cee..56ffec3 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -687,7 +687,7 @@ init_or_cleanup(int init) return status; cleanup: - nf_unregister_queue_handler(PF_INET6); + nf_unregister_queue_handlers(&ipq_enqueue_packet); synchronize_net(); ipq_flush(NF_DROP); -- cgit v0.10.2 From f6ebe77f955d77a988ce726f0818ec0103b11323 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:21:49 -0700 Subject: [NETFILTER]: split net/core/netfilter.c into net/netfilter/*.c This patch doesn't introduce any code changes, but merely splits the core netfilter code into four separate files. It also moves it from it's old location in net/core/ to the recently-created net/netfilter/ directory. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/core/Makefile b/net/core/Makefile index f5f5e58..630da0f 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -12,7 +12,6 @@ obj-y += dev.o ethtool.o dev_mcast.o dst.o \ obj-$(CONFIG_XFRM) += flow.o obj-$(CONFIG_SYSFS) += net-sysfs.o -obj-$(CONFIG_NETFILTER) += netfilter.o obj-$(CONFIG_NET_DIVERT) += dv.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NET_RADIO) += wireless.o diff --git a/net/core/netfilter.c b/net/core/netfilter.c deleted file mode 100644 index 98cc61e..0000000 --- a/net/core/netfilter.c +++ /dev/null @@ -1,737 +0,0 @@ -/* netfilter.c: look after the filters for various protocols. - * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. - * - * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any - * way. - * - * Rusty Russell (C)2000 -- This code is GPL. - * - * February 2000: Modified by James Morris to have 1 queue per protocol. - * 15-Mar-2000: Added NF_REPEAT --RR. - * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* In this code, we can be waiting indefinitely for userspace to - * service a packet if a hook returns NF_QUEUE. We could keep a count - * of skbuffs queued for userspace, and not deregister a hook unless - * this is zero, but that sucks. Now, we simply check when the - * packets come back: if the hook is gone, the packet is discarded. */ -#ifdef CONFIG_NETFILTER_DEBUG -#define NFDEBUG(format, args...) printk(format , ## args) -#else -#define NFDEBUG(format, args...) -#endif - -/* Sockopts only registered and called from user context, so - net locking would be overkill. Also, [gs]etsockopt calls may - sleep. */ -static DECLARE_MUTEX(nf_sockopt_mutex); - -struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; -static LIST_HEAD(nf_sockopts); -static DEFINE_SPINLOCK(nf_hook_lock); - -/* - * A queue handler may be registered for each protocol. Each is protected by - * long term mutex. The handler must provide an an outfn() to accept packets - * for queueing and must reinject all packets it receives, no matter what. - */ -static struct nf_queue_handler_t { - nf_queue_outfn_t outfn; - void *data; -} queue_handler[NPROTO]; - -static struct nf_queue_rerouter *queue_rerouter; - -static DEFINE_RWLOCK(queue_handler_lock); - -int nf_register_hook(struct nf_hook_ops *reg) -{ - struct list_head *i; - - spin_lock_bh(&nf_hook_lock); - list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { - if (reg->priority < ((struct nf_hook_ops *)i)->priority) - break; - } - list_add_rcu(®->list, i->prev); - spin_unlock_bh(&nf_hook_lock); - - synchronize_net(); - return 0; -} - -void nf_unregister_hook(struct nf_hook_ops *reg) -{ - spin_lock_bh(&nf_hook_lock); - list_del_rcu(®->list); - spin_unlock_bh(&nf_hook_lock); - - synchronize_net(); -} - -/* Do exclusive ranges overlap? */ -static inline int overlap(int min1, int max1, int min2, int max2) -{ - return max1 > min2 && min1 < max2; -} - -/* Functions to register sockopt ranges (exclusive). */ -int nf_register_sockopt(struct nf_sockopt_ops *reg) -{ - struct list_head *i; - int ret = 0; - - if (down_interruptible(&nf_sockopt_mutex) != 0) - return -EINTR; - - list_for_each(i, &nf_sockopts) { - struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; - if (ops->pf == reg->pf - && (overlap(ops->set_optmin, ops->set_optmax, - reg->set_optmin, reg->set_optmax) - || overlap(ops->get_optmin, ops->get_optmax, - reg->get_optmin, reg->get_optmax))) { - NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", - ops->set_optmin, ops->set_optmax, - ops->get_optmin, ops->get_optmax, - reg->set_optmin, reg->set_optmax, - reg->get_optmin, reg->get_optmax); - ret = -EBUSY; - goto out; - } - } - - list_add(®->list, &nf_sockopts); -out: - up(&nf_sockopt_mutex); - return ret; -} - -void nf_unregister_sockopt(struct nf_sockopt_ops *reg) -{ - /* No point being interruptible: we're probably in cleanup_module() */ - restart: - down(&nf_sockopt_mutex); - if (reg->use != 0) { - /* To be woken by nf_sockopt call... */ - /* FIXME: Stuart Young's name appears gratuitously. */ - set_current_state(TASK_UNINTERRUPTIBLE); - reg->cleanup_task = current; - up(&nf_sockopt_mutex); - schedule(); - goto restart; - } - list_del(®->list); - up(&nf_sockopt_mutex); -} - -/* Call get/setsockopt() */ -static int nf_sockopt(struct sock *sk, int pf, int val, - char __user *opt, int *len, int get) -{ - struct list_head *i; - struct nf_sockopt_ops *ops; - int ret; - - if (down_interruptible(&nf_sockopt_mutex) != 0) - return -EINTR; - - list_for_each(i, &nf_sockopts) { - ops = (struct nf_sockopt_ops *)i; - if (ops->pf == pf) { - if (get) { - if (val >= ops->get_optmin - && val < ops->get_optmax) { - ops->use++; - up(&nf_sockopt_mutex); - ret = ops->get(sk, val, opt, len); - goto out; - } - } else { - if (val >= ops->set_optmin - && val < ops->set_optmax) { - ops->use++; - up(&nf_sockopt_mutex); - ret = ops->set(sk, val, opt, *len); - goto out; - } - } - } - } - up(&nf_sockopt_mutex); - return -ENOPROTOOPT; - - out: - down(&nf_sockopt_mutex); - ops->use--; - if (ops->cleanup_task) - wake_up_process(ops->cleanup_task); - up(&nf_sockopt_mutex); - return ret; -} - -int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, - int len) -{ - return nf_sockopt(sk, pf, val, opt, &len, 0); -} - -int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) -{ - return nf_sockopt(sk, pf, val, opt, len, 1); -} - -static unsigned int nf_iterate(struct list_head *head, - struct sk_buff **skb, - int hook, - const struct net_device *indev, - const struct net_device *outdev, - struct list_head **i, - int (*okfn)(struct sk_buff *), - int hook_thresh) -{ - unsigned int verdict; - - /* - * The caller must not block between calls to this - * function because of risk of continuing from deleted element. - */ - list_for_each_continue_rcu(*i, head) { - struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; - - if (hook_thresh > elem->priority) - continue; - - /* Optimization: we don't need to hold module - reference here, since function can't sleep. --RR */ - verdict = elem->hook(hook, skb, indev, outdev, okfn); - if (verdict != NF_ACCEPT) { -#ifdef CONFIG_NETFILTER_DEBUG - if (unlikely((verdict & NF_VERDICT_MASK) - > NF_MAX_VERDICT)) { - NFDEBUG("Evil return from %p(%u).\n", - elem->hook, hook); - continue; - } -#endif - if (verdict != NF_REPEAT) - return verdict; - *i = (*i)->prev; - } - } - return NF_ACCEPT; -} - -int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) -{ - int ret; - - if (pf >= NPROTO) - return -EINVAL; - - write_lock_bh(&queue_handler_lock); - if (queue_handler[pf].outfn) - ret = -EBUSY; - else { - queue_handler[pf].outfn = outfn; - queue_handler[pf].data = data; - ret = 0; - } - write_unlock_bh(&queue_handler_lock); - - return ret; -} - -/* The caller must flush their queue before this */ -int nf_unregister_queue_handler(int pf) -{ - if (pf >= NPROTO) - return -EINVAL; - - write_lock_bh(&queue_handler_lock); - queue_handler[pf].outfn = NULL; - queue_handler[pf].data = NULL; - write_unlock_bh(&queue_handler_lock); - - return 0; -} - -int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer) -{ - if (pf >= NPROTO) - return -EINVAL; - - write_lock_bh(&queue_handler_lock); - memcpy(&queue_rerouter[pf], rer, sizeof(queue_rerouter[pf])); - write_unlock_bh(&queue_handler_lock); - - return 0; -} - -int nf_unregister_queue_rerouter(int pf) -{ - if (pf >= NPROTO) - return -EINVAL; - - write_lock_bh(&queue_handler_lock); - memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf])); - write_unlock_bh(&queue_handler_lock); - return 0; -} - -void nf_unregister_queue_handlers(nf_queue_outfn_t outfn) -{ - int pf; - - write_lock_bh(&queue_handler_lock); - for (pf = 0; pf < NPROTO; pf++) { - if (queue_handler[pf].outfn == outfn) { - queue_handler[pf].outfn = NULL; - queue_handler[pf].data = NULL; - } - } - write_unlock_bh(&queue_handler_lock); -} - -/* - * Any packet that leaves via this function must come back - * through nf_reinject(). - */ -static int nf_queue(struct sk_buff **skb, - struct list_head *elem, - int pf, unsigned int hook, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - unsigned int queuenum) -{ - int status; - struct nf_info *info; -#ifdef CONFIG_BRIDGE_NETFILTER - struct net_device *physindev = NULL; - struct net_device *physoutdev = NULL; -#endif - - /* QUEUE == DROP if noone is waiting, to be safe. */ - read_lock(&queue_handler_lock); - if (!queue_handler[pf].outfn) { - read_unlock(&queue_handler_lock); - kfree_skb(*skb); - return 1; - } - - info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC); - if (!info) { - if (net_ratelimit()) - printk(KERN_ERR "OOM queueing packet %p\n", - *skb); - read_unlock(&queue_handler_lock); - kfree_skb(*skb); - return 1; - } - - *info = (struct nf_info) { - (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; - - /* If it's going away, ignore hook. */ - if (!try_module_get(info->elem->owner)) { - read_unlock(&queue_handler_lock); - kfree(info); - return 0; - } - - /* Bump dev refs so they don't vanish while packet is out */ - if (indev) dev_hold(indev); - if (outdev) dev_hold(outdev); - -#ifdef CONFIG_BRIDGE_NETFILTER - if ((*skb)->nf_bridge) { - physindev = (*skb)->nf_bridge->physindev; - if (physindev) dev_hold(physindev); - physoutdev = (*skb)->nf_bridge->physoutdev; - if (physoutdev) dev_hold(physoutdev); - } -#endif - if (queue_rerouter[pf].save) - queue_rerouter[pf].save(*skb, info); - - status = queue_handler[pf].outfn(*skb, info, queuenum, - queue_handler[pf].data); - - if (status >= 0 && queue_rerouter[pf].reroute) - status = queue_rerouter[pf].reroute(skb, info); - - read_unlock(&queue_handler_lock); - - if (status < 0) { - /* James M doesn't say fuck enough. */ - if (indev) dev_put(indev); - if (outdev) dev_put(outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (physindev) dev_put(physindev); - if (physoutdev) dev_put(physoutdev); -#endif - module_put(info->elem->owner); - kfree(info); - kfree_skb(*skb); - - return 1; - } - - return 1; -} - -/* Returns 1 if okfn() needs to be executed by the caller, - * -EPERM for NF_DROP, 0 otherwise. */ -int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - int hook_thresh) -{ - struct list_head *elem; - unsigned int verdict; - int ret = 0; - - /* We may already have this, but read-locks nest anyway */ - rcu_read_lock(); - - elem = &nf_hooks[pf][hook]; -next_hook: - verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, - outdev, &elem, okfn, hook_thresh); - if (verdict == NF_ACCEPT || verdict == NF_STOP) { - ret = 1; - goto unlock; - } else if (verdict == NF_DROP) { - kfree_skb(*pskb); - ret = -EPERM; - } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { - NFDEBUG("nf_hook: Verdict = QUEUE.\n"); - if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn, - verdict >> NF_VERDICT_BITS)) - goto next_hook; - } -unlock: - rcu_read_unlock(); - return ret; -} - -void nf_reinject(struct sk_buff *skb, struct nf_info *info, - unsigned int verdict) -{ - struct list_head *elem = &info->elem->list; - struct list_head *i; - - rcu_read_lock(); - - /* Release those devices we held, or Alexey will kill me. */ - if (info->indev) dev_put(info->indev); - if (info->outdev) dev_put(info->outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - if (skb->nf_bridge->physindev) - dev_put(skb->nf_bridge->physindev); - if (skb->nf_bridge->physoutdev) - dev_put(skb->nf_bridge->physoutdev); - } -#endif - - /* Drop reference to owner of hook which queued us. */ - module_put(info->elem->owner); - - list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { - if (i == elem) - break; - } - - if (elem == &nf_hooks[info->pf][info->hook]) { - /* The module which sent it to userspace is gone. */ - NFDEBUG("%s: module disappeared, dropping packet.\n", - __FUNCTION__); - verdict = NF_DROP; - } - - /* Continue traversal iff userspace said ok... */ - if (verdict == NF_REPEAT) { - elem = elem->prev; - verdict = NF_ACCEPT; - } - - if (verdict == NF_ACCEPT) { - next_hook: - verdict = nf_iterate(&nf_hooks[info->pf][info->hook], - &skb, info->hook, - info->indev, info->outdev, &elem, - info->okfn, INT_MIN); - } - - switch (verdict & NF_VERDICT_MASK) { - case NF_ACCEPT: - info->okfn(skb); - break; - - case NF_QUEUE: - if (!nf_queue(&skb, elem, info->pf, info->hook, - info->indev, info->outdev, info->okfn, - verdict >> NF_VERDICT_BITS)) - goto next_hook; - break; - } - rcu_read_unlock(); - - if (verdict == NF_DROP) - kfree_skb(skb); - - kfree(info); - return; -} - -int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len) -{ - struct sk_buff *nskb; - - if (writable_len > (*pskb)->len) - return 0; - - /* Not exclusive use of packet? Must copy. */ - if (skb_shared(*pskb) || skb_cloned(*pskb)) - goto copy_skb; - - return pskb_may_pull(*pskb, writable_len); - -copy_skb: - nskb = skb_copy(*pskb, GFP_ATOMIC); - if (!nskb) - return 0; - BUG_ON(skb_is_nonlinear(nskb)); - - /* Rest of kernel will get very unhappy if we pass it a - suddenly-orphaned skbuff */ - if ((*pskb)->sk) - skb_set_owner_w(nskb, (*pskb)->sk); - kfree_skb(*pskb); - *pskb = nskb; - return 1; -} -EXPORT_SYMBOL(skb_make_writable); - -/* Internal logging interface, which relies on the real - LOG target modules */ - -#define NF_LOG_PREFIXLEN 128 - -static struct nf_logger *nf_logging[NPROTO]; /* = NULL */ -static DEFINE_SPINLOCK(nf_log_lock); - -int nf_log_register(int pf, struct nf_logger *logger) -{ - int ret = -EBUSY; - - /* Any setup of logging members must be done before - * substituting pointer. */ - spin_lock(&nf_log_lock); - if (!nf_logging[pf]) { - rcu_assign_pointer(nf_logging[pf], logger); - ret = 0; - } - spin_unlock(&nf_log_lock); - return ret; -} - -void nf_log_unregister_pf(int pf) -{ - spin_lock(&nf_log_lock); - nf_logging[pf] = NULL; - spin_unlock(&nf_log_lock); - - /* Give time to concurrent readers. */ - synchronize_net(); -} - -void nf_log_unregister_logger(struct nf_logger *logger) -{ - int i; - - spin_lock(&nf_log_lock); - for (i = 0; i < NPROTO; i++) { - if (nf_logging[i] == logger) - nf_logging[i] = NULL; - } - spin_unlock(&nf_log_lock); - - synchronize_net(); -} - -void nf_log_packet(int pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - struct nf_loginfo *loginfo, - const char *fmt, ...) -{ - va_list args; - char prefix[NF_LOG_PREFIXLEN]; - struct nf_logger *logger; - - rcu_read_lock(); - logger = rcu_dereference(nf_logging[pf]); - if (logger) { - va_start(args, fmt); - vsnprintf(prefix, sizeof(prefix), fmt, args); - va_end(args); - /* We must read logging before nf_logfn[pf] */ - logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); - } else if (net_ratelimit()) { - printk(KERN_WARNING "nf_log_packet: can\'t log since " - "no backend logging module loaded in! Please either " - "load one, or disable logging explicitly\n"); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL(nf_log_register); -EXPORT_SYMBOL(nf_log_unregister_pf); -EXPORT_SYMBOL(nf_log_unregister_logger); -EXPORT_SYMBOL(nf_log_packet); - -#ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_net_netfilter; -EXPORT_SYMBOL(proc_net_netfilter); - -static void *seq_start(struct seq_file *seq, loff_t *pos) -{ - rcu_read_lock(); - - if (*pos >= NPROTO) - return NULL; - - return pos; -} - -static void *seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - (*pos)++; - - if (*pos >= NPROTO) - return NULL; - - return pos; -} - -static void seq_stop(struct seq_file *s, void *v) -{ - rcu_read_unlock(); -} - -static int seq_show(struct seq_file *s, void *v) -{ - loff_t *pos = v; - const struct nf_logger *logger; - - logger = rcu_dereference(nf_logging[*pos]); - - if (!logger) - return seq_printf(s, "%2lld NONE\n", *pos); - - return seq_printf(s, "%2lld %s\n", *pos, logger->name); -} - -static struct seq_operations nflog_seq_ops = { - .start = seq_start, - .next = seq_next, - .stop = seq_stop, - .show = seq_show, -}; - -static int nflog_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &nflog_seq_ops); -} - -static struct file_operations nflog_file_ops = { - .owner = THIS_MODULE, - .open = nflog_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#endif /* PROC_FS */ - - -/* This does not belong here, but locally generated errors need it if connection - tracking in use: without this, connection may not be in hash table, and hence - manufactured ICMP or RST packets will not be associated with it. */ -void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); - -void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) -{ - void (*attach)(struct sk_buff *, struct sk_buff *); - - if (skb->nfct && (attach = ip_ct_attach) != NULL) { - mb(); /* Just to be sure: must be read before executing this */ - attach(new, skb); - } -} - -void __init netfilter_init(void) -{ - int i, h; -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *pde; -#endif - - queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter), - GFP_KERNEL); - if (!queue_rerouter) - panic("netfilter: cannot allocate queue rerouter array\n"); - memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter)); - - for (i = 0; i < NPROTO; i++) { - for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&nf_hooks[i][h]); - } - -#ifdef CONFIG_PROC_FS - proc_net_netfilter = proc_mkdir("netfilter", proc_net); - if (!proc_net_netfilter) - panic("cannot create netfilter proc entry"); - pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter); - if (!pde) - panic("cannot create /proc/net/netfilter/nf_log"); - pde->proc_fops = &nflog_file_ops; -#endif -} - -EXPORT_SYMBOL(ip_ct_attach); -EXPORT_SYMBOL(nf_ct_attach); -EXPORT_SYMBOL(nf_getsockopt); -EXPORT_SYMBOL(nf_hook_slow); -EXPORT_SYMBOL(nf_hooks); -EXPORT_SYMBOL(nf_register_hook); -EXPORT_SYMBOL(nf_register_queue_handler); -EXPORT_SYMBOL(nf_register_sockopt); -EXPORT_SYMBOL(nf_reinject); -EXPORT_SYMBOL(nf_setsockopt); -EXPORT_SYMBOL(nf_unregister_hook); -EXPORT_SYMBOL(nf_unregister_queue_handler); -EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); -EXPORT_SYMBOL_GPL(nf_register_queue_rerouter); -EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter); -EXPORT_SYMBOL(nf_unregister_sockopt); diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index c41caeb..b3b44f8 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,3 +1,7 @@ +netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o + +obj-$(CONFIG_NETFILTER) = netfilter.o + obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c new file mode 100644 index 0000000..1ceb1a6 --- /dev/null +++ b/net/netfilter/core.c @@ -0,0 +1,216 @@ +/* netfilter.c: look after the filters for various protocols. + * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. + * + * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any + * way. + * + * Rusty Russell (C)2000 -- This code is GPL. + * + * February 2000: Modified by James Morris to have 1 queue per protocol. + * 15-Mar-2000: Added NF_REPEAT --RR. + * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nf_internals.h" + +/* In this code, we can be waiting indefinitely for userspace to + * service a packet if a hook returns NF_QUEUE. We could keep a count + * of skbuffs queued for userspace, and not deregister a hook unless + * this is zero, but that sucks. Now, we simply check when the + * packets come back: if the hook is gone, the packet is discarded. */ +struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; +EXPORT_SYMBOL(nf_hooks); +static DEFINE_SPINLOCK(nf_hook_lock); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct list_head *i; + + spin_lock_bh(&nf_hook_lock); + list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { + if (reg->priority < ((struct nf_hook_ops *)i)->priority) + break; + } + list_add_rcu(®->list, i->prev); + spin_unlock_bh(&nf_hook_lock); + + synchronize_net(); + return 0; +} +EXPORT_SYMBOL(nf_register_hook); + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + spin_lock_bh(&nf_hook_lock); + list_del_rcu(®->list); + spin_unlock_bh(&nf_hook_lock); + + synchronize_net(); +} +EXPORT_SYMBOL(nf_unregister_hook); + +unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + unsigned int verdict; + + /* + * The caller must not block between calls to this + * function because of risk of continuing from deleted element. + */ + list_for_each_continue_rcu(*i, head) { + struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; + + if (hook_thresh > elem->priority) + continue; + + /* Optimization: we don't need to hold module + reference here, since function can't sleep. --RR */ + verdict = elem->hook(hook, skb, indev, outdev, okfn); + if (verdict != NF_ACCEPT) { +#ifdef CONFIG_NETFILTER_DEBUG + if (unlikely((verdict & NF_VERDICT_MASK) + > NF_MAX_VERDICT)) { + NFDEBUG("Evil return from %p(%u).\n", + elem->hook, hook); + continue; + } +#endif + if (verdict != NF_REPEAT) + return verdict; + *i = (*i)->prev; + } + } + return NF_ACCEPT; +} + + +/* Returns 1 if okfn() needs to be executed by the caller, + * -EPERM for NF_DROP, 0 otherwise. */ +int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + struct list_head *elem; + unsigned int verdict; + int ret = 0; + + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + + elem = &nf_hooks[pf][hook]; +next_hook: + verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, + outdev, &elem, okfn, hook_thresh); + if (verdict == NF_ACCEPT || verdict == NF_STOP) { + ret = 1; + goto unlock; + } else if (verdict == NF_DROP) { + kfree_skb(*pskb); + ret = -EPERM; + } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { + NFDEBUG("nf_hook: Verdict = QUEUE.\n"); + if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn, + verdict >> NF_VERDICT_BITS)) + goto next_hook; + } +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_hook_slow); + + +int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len) +{ + struct sk_buff *nskb; + + if (writable_len > (*pskb)->len) + return 0; + + /* Not exclusive use of packet? Must copy. */ + if (skb_shared(*pskb) || skb_cloned(*pskb)) + goto copy_skb; + + return pskb_may_pull(*pskb, writable_len); + +copy_skb: + nskb = skb_copy(*pskb, GFP_ATOMIC); + if (!nskb) + return 0; + BUG_ON(skb_is_nonlinear(nskb)); + + /* Rest of kernel will get very unhappy if we pass it a + suddenly-orphaned skbuff */ + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + return 1; +} +EXPORT_SYMBOL(skb_make_writable); + + +/* This does not belong here, but locally generated errors need it if connection + tracking in use: without this, connection may not be in hash table, and hence + manufactured ICMP or RST packets will not be associated with it. */ +void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); +EXPORT_SYMBOL(ip_ct_attach); + +void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +{ + void (*attach)(struct sk_buff *, struct sk_buff *); + + if (skb->nfct && (attach = ip_ct_attach) != NULL) { + mb(); /* Just to be sure: must be read before executing this */ + attach(new, skb); + } +} +EXPORT_SYMBOL(nf_ct_attach); + +#ifdef CONFIG_PROC_FS +struct proc_dir_entry *proc_net_netfilter; +EXPORT_SYMBOL(proc_net_netfilter); +#endif + +void __init netfilter_init(void) +{ + int i, h; + for (i = 0; i < NPROTO; i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&nf_hooks[i][h]); + } + +#ifdef CONFIG_PROC_FS + proc_net_netfilter = proc_mkdir("netfilter", proc_net); + if (!proc_net_netfilter) + panic("cannot create netfilter proc entry"); +#endif + + if (netfilter_queue_init() < 0) + panic("cannot initialize nf_queue"); + if (netfilter_log_init() < 0) + panic("cannot initialize nf_log"); +} diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h new file mode 100644 index 0000000..6bdee29 --- /dev/null +++ b/net/netfilter/nf_internals.h @@ -0,0 +1,39 @@ +#ifndef _NF_INTERNALS_H +#define _NF_INTERNALS_H + +#include +#include +#include +#include + +#ifdef CONFIG_NETFILTER_DEBUG +#define NFDEBUG(format, args...) printk(format , ## args) +#else +#define NFDEBUG(format, args...) +#endif + + +/* core.c */ +extern unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh); + +/* nf_queue.c */ +extern int nf_queue(struct sk_buff **skb, + struct list_head *elem, + int pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + unsigned int queuenum); +extern int __init netfilter_queue_init(void); + +/* nf_log.c */ +extern int __init netfilter_log_init(void); + +#endif diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c new file mode 100644 index 0000000..ec58c4d --- /dev/null +++ b/net/netfilter/nf_log.c @@ -0,0 +1,165 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nf_internals.h" + +/* Internal logging interface, which relies on the real + LOG target modules */ + +#define NF_LOG_PREFIXLEN 128 + +static struct nf_logger *nf_logging[NPROTO]; /* = NULL */ +static DEFINE_SPINLOCK(nf_log_lock); + +int nf_log_register(int pf, struct nf_logger *logger) +{ + int ret = -EBUSY; + + /* Any setup of logging members must be done before + * substituting pointer. */ + spin_lock(&nf_log_lock); + if (!nf_logging[pf]) { + rcu_assign_pointer(nf_logging[pf], logger); + ret = 0; + } + spin_unlock(&nf_log_lock); + return ret; +} +EXPORT_SYMBOL(nf_log_register); + +void nf_log_unregister_pf(int pf) +{ + spin_lock(&nf_log_lock); + nf_logging[pf] = NULL; + spin_unlock(&nf_log_lock); + + /* Give time to concurrent readers. */ + synchronize_net(); +} +EXPORT_SYMBOL(nf_log_unregister_pf); + +void nf_log_unregister_logger(struct nf_logger *logger) +{ + int i; + + spin_lock(&nf_log_lock); + for (i = 0; i < NPROTO; i++) { + if (nf_logging[i] == logger) + nf_logging[i] = NULL; + } + spin_unlock(&nf_log_lock); + + synchronize_net(); +} +EXPORT_SYMBOL(nf_log_unregister_logger); + +void nf_log_packet(int pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_loginfo *loginfo, + const char *fmt, ...) +{ + va_list args; + char prefix[NF_LOG_PREFIXLEN]; + struct nf_logger *logger; + + rcu_read_lock(); + logger = rcu_dereference(nf_logging[pf]); + if (logger) { + va_start(args, fmt); + vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + /* We must read logging before nf_logfn[pf] */ + logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); + } else if (net_ratelimit()) { + printk(KERN_WARNING "nf_log_packet: can\'t log since " + "no backend logging module loaded in! Please either " + "load one, or disable logging explicitly\n"); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_log_packet); + +#ifdef CONFIG_PROC_FS +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + rcu_read_lock(); + + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void seq_stop(struct seq_file *s, void *v) +{ + rcu_read_unlock(); +} + +static int seq_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + const struct nf_logger *logger; + + logger = rcu_dereference(nf_logging[*pos]); + + if (!logger) + return seq_printf(s, "%2lld NONE\n", *pos); + + return seq_printf(s, "%2lld %s\n", *pos, logger->name); +} + +static struct seq_operations nflog_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nflog_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nflog_seq_ops); +} + +static struct file_operations nflog_file_ops = { + .owner = THIS_MODULE, + .open = nflog_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* PROC_FS */ + + +int __init netfilter_log_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; + pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter); +#endif + if (!pde) + return -1; + + pde->proc_fops = &nflog_file_ops; + + return 0; +} diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c new file mode 100644 index 0000000..5586f84 --- /dev/null +++ b/net/netfilter/nf_queue.c @@ -0,0 +1,273 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nf_internals.h" + +/* + * A queue handler may be registered for each protocol. Each is protected by + * long term mutex. The handler must provide an an outfn() to accept packets + * for queueing and must reinject all packets it receives, no matter what. + */ +static struct nf_queue_handler_t { + nf_queue_outfn_t outfn; + void *data; +} queue_handler[NPROTO]; + +static struct nf_queue_rerouter *queue_rerouter; + +static DEFINE_RWLOCK(queue_handler_lock); + + +int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) +{ + int ret; + + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + if (queue_handler[pf].outfn) + ret = -EBUSY; + else { + queue_handler[pf].outfn = outfn; + queue_handler[pf].data = data; + ret = 0; + } + write_unlock_bh(&queue_handler_lock); + + return ret; +} +EXPORT_SYMBOL(nf_register_queue_handler); + +/* The caller must flush their queue before this */ +int nf_unregister_queue_handler(int pf) +{ + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + queue_handler[pf].outfn = NULL; + queue_handler[pf].data = NULL; + write_unlock_bh(&queue_handler_lock); + + return 0; +} +EXPORT_SYMBOL(nf_unregister_queue_handler); + +int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer) +{ + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + memcpy(&queue_rerouter[pf], rer, sizeof(queue_rerouter[pf])); + write_unlock_bh(&queue_handler_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_register_queue_rerouter); + +int nf_unregister_queue_rerouter(int pf) +{ + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf])); + write_unlock_bh(&queue_handler_lock); + return 0; +} +EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter); + +void nf_unregister_queue_handlers(nf_queue_outfn_t outfn) +{ + int pf; + + write_lock_bh(&queue_handler_lock); + for (pf = 0; pf < NPROTO; pf++) { + if (queue_handler[pf].outfn == outfn) { + queue_handler[pf].outfn = NULL; + queue_handler[pf].data = NULL; + } + } + write_unlock_bh(&queue_handler_lock); +} +EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); + +/* + * Any packet that leaves via this function must come back + * through nf_reinject(). + */ +int nf_queue(struct sk_buff **skb, + struct list_head *elem, + int pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + unsigned int queuenum) +{ + int status; + struct nf_info *info; +#ifdef CONFIG_BRIDGE_NETFILTER + struct net_device *physindev = NULL; + struct net_device *physoutdev = NULL; +#endif + + /* QUEUE == DROP if noone is waiting, to be safe. */ + read_lock(&queue_handler_lock); + if (!queue_handler[pf].outfn) { + read_unlock(&queue_handler_lock); + kfree_skb(*skb); + return 1; + } + + info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC); + if (!info) { + if (net_ratelimit()) + printk(KERN_ERR "OOM queueing packet %p\n", + *skb); + read_unlock(&queue_handler_lock); + kfree_skb(*skb); + return 1; + } + + *info = (struct nf_info) { + (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; + + /* If it's going away, ignore hook. */ + if (!try_module_get(info->elem->owner)) { + read_unlock(&queue_handler_lock); + kfree(info); + return 0; + } + + /* Bump dev refs so they don't vanish while packet is out */ + if (indev) dev_hold(indev); + if (outdev) dev_hold(outdev); + +#ifdef CONFIG_BRIDGE_NETFILTER + if ((*skb)->nf_bridge) { + physindev = (*skb)->nf_bridge->physindev; + if (physindev) dev_hold(physindev); + physoutdev = (*skb)->nf_bridge->physoutdev; + if (physoutdev) dev_hold(physoutdev); + } +#endif + if (queue_rerouter[pf].save) + queue_rerouter[pf].save(*skb, info); + + status = queue_handler[pf].outfn(*skb, info, queuenum, + queue_handler[pf].data); + + if (status >= 0 && queue_rerouter[pf].reroute) + status = queue_rerouter[pf].reroute(skb, info); + + read_unlock(&queue_handler_lock); + + if (status < 0) { + /* James M doesn't say fuck enough. */ + if (indev) dev_put(indev); + if (outdev) dev_put(outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (physindev) dev_put(physindev); + if (physoutdev) dev_put(physoutdev); +#endif + module_put(info->elem->owner); + kfree(info); + kfree_skb(*skb); + + return 1; + } + + return 1; +} + +void nf_reinject(struct sk_buff *skb, struct nf_info *info, + unsigned int verdict) +{ + struct list_head *elem = &info->elem->list; + struct list_head *i; + + rcu_read_lock(); + + /* Release those devices we held, or Alexey will kill me. */ + if (info->indev) dev_put(info->indev); + if (info->outdev) dev_put(info->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + if (skb->nf_bridge->physindev) + dev_put(skb->nf_bridge->physindev); + if (skb->nf_bridge->physoutdev) + dev_put(skb->nf_bridge->physoutdev); + } +#endif + + /* Drop reference to owner of hook which queued us. */ + module_put(info->elem->owner); + + list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { + if (i == elem) + break; + } + + if (elem == &nf_hooks[info->pf][info->hook]) { + /* The module which sent it to userspace is gone. */ + NFDEBUG("%s: module disappeared, dropping packet.\n", + __FUNCTION__); + verdict = NF_DROP; + } + + /* Continue traversal iff userspace said ok... */ + if (verdict == NF_REPEAT) { + elem = elem->prev; + verdict = NF_ACCEPT; + } + + if (verdict == NF_ACCEPT) { + next_hook: + verdict = nf_iterate(&nf_hooks[info->pf][info->hook], + &skb, info->hook, + info->indev, info->outdev, &elem, + info->okfn, INT_MIN); + } + + switch (verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + info->okfn(skb); + break; + + case NF_QUEUE: + if (!nf_queue(&skb, elem, info->pf, info->hook, + info->indev, info->outdev, info->okfn, + verdict >> NF_VERDICT_BITS)) + goto next_hook; + break; + } + rcu_read_unlock(); + + if (verdict == NF_DROP) + kfree_skb(skb); + + kfree(info); + return; +} +EXPORT_SYMBOL(nf_reinject); + +int __init netfilter_queue_init(void) +{ + queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter), + GFP_KERNEL); + if (!queue_rerouter) + return -ENOMEM; + + memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter)); + + return 0; +} + diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c new file mode 100644 index 0000000..61a833a --- /dev/null +++ b/net/netfilter/nf_sockopt.c @@ -0,0 +1,132 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "nf_internals.h" + +/* Sockopts only registered and called from user context, so + net locking would be overkill. Also, [gs]etsockopt calls may + sleep. */ +static DECLARE_MUTEX(nf_sockopt_mutex); +static LIST_HEAD(nf_sockopts); + +/* Do exclusive ranges overlap? */ +static inline int overlap(int min1, int max1, int min2, int max2) +{ + return max1 > min2 && min1 < max2; +} + +/* Functions to register sockopt ranges (exclusive). */ +int nf_register_sockopt(struct nf_sockopt_ops *reg) +{ + struct list_head *i; + int ret = 0; + + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + list_for_each(i, &nf_sockopts) { + struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; + if (ops->pf == reg->pf + && (overlap(ops->set_optmin, ops->set_optmax, + reg->set_optmin, reg->set_optmax) + || overlap(ops->get_optmin, ops->get_optmax, + reg->get_optmin, reg->get_optmax))) { + NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", + ops->set_optmin, ops->set_optmax, + ops->get_optmin, ops->get_optmax, + reg->set_optmin, reg->set_optmax, + reg->get_optmin, reg->get_optmax); + ret = -EBUSY; + goto out; + } + } + + list_add(®->list, &nf_sockopts); +out: + up(&nf_sockopt_mutex); + return ret; +} +EXPORT_SYMBOL(nf_register_sockopt); + +void nf_unregister_sockopt(struct nf_sockopt_ops *reg) +{ + /* No point being interruptible: we're probably in cleanup_module() */ + restart: + down(&nf_sockopt_mutex); + if (reg->use != 0) { + /* To be woken by nf_sockopt call... */ + /* FIXME: Stuart Young's name appears gratuitously. */ + set_current_state(TASK_UNINTERRUPTIBLE); + reg->cleanup_task = current; + up(&nf_sockopt_mutex); + schedule(); + goto restart; + } + list_del(®->list); + up(&nf_sockopt_mutex); +} +EXPORT_SYMBOL(nf_unregister_sockopt); + +/* Call get/setsockopt() */ +static int nf_sockopt(struct sock *sk, int pf, int val, + char __user *opt, int *len, int get) +{ + struct list_head *i; + struct nf_sockopt_ops *ops; + int ret; + + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + list_for_each(i, &nf_sockopts) { + ops = (struct nf_sockopt_ops *)i; + if (ops->pf == pf) { + if (get) { + if (val >= ops->get_optmin + && val < ops->get_optmax) { + ops->use++; + up(&nf_sockopt_mutex); + ret = ops->get(sk, val, opt, len); + goto out; + } + } else { + if (val >= ops->set_optmin + && val < ops->set_optmax) { + ops->use++; + up(&nf_sockopt_mutex); + ret = ops->set(sk, val, opt, *len); + goto out; + } + } + } + } + up(&nf_sockopt_mutex); + return -ENOPROTOOPT; + + out: + down(&nf_sockopt_mutex); + ops->use--; + if (ops->cleanup_task) + wake_up_process(ops->cleanup_task); + up(&nf_sockopt_mutex); + return ret; +} + +int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, + int len) +{ + return nf_sockopt(sk, pf, val, opt, &len, 0); +} +EXPORT_SYMBOL(nf_setsockopt); + +int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) +{ + return nf_sockopt(sk, pf, val, opt, len, 1); +} +EXPORT_SYMBOL(nf_getsockopt); + -- cgit v0.10.2 From fbcd923c3e0c8ec9e4ed64f5a4e5766807b32729 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:22:10 -0700 Subject: [NETFILTER]: add correct bridging support to nfnetlink_{queue,log} This patch adds support for passing the real 'physical' device ifindex down to userspace via nfnetlink_log and nfnetlink_queue. This feature basically obsoletes net/bridge/netfilter/ebt_ulog.c, and it is likely ebt_ulog.c will die with one of the next couple of patches. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h index 420ff46..a61836a 100644 --- a/include/linux/netfilter/nfnetlink_log.h +++ b/include/linux/netfilter/nfnetlink_log.h @@ -40,6 +40,8 @@ enum nfulnl_attr_type { NFULA_TIMESTAMP, /* nfulnl_msg_packet_timestamp */ NFULA_IFINDEX_INDEV, /* u_int32_t ifindex */ NFULA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ + NFULA_IFINDEX_PHYSINDEV, /* u_int32_t ifindex */ + NFULA_IFINDEX_PHYSOUTDEV, /* u_int32_t ifindex */ NFULA_HWADDR, /* nfulnl_msg_packet_hw */ NFULA_PAYLOAD, /* opaque data payload */ NFULA_PREFIX, /* string prefix */ diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h index e142b0f..2d8d2b2 100644 --- a/include/linux/netfilter/nfnetlink_queue.h +++ b/include/linux/netfilter/nfnetlink_queue.h @@ -36,6 +36,8 @@ enum nfqnl_attr_type { NFQA_TIMESTAMP, /* nfqnl_msg_packet_timestamp */ NFQA_IFINDEX_INDEV, /* u_int32_t ifindex */ NFQA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ + NFQA_IFINDEX_PHYSINDEV, /* u_int32_t ifindex */ + NFQA_IFINDEX_PHYSOUTDEV, /* u_int32_t ifindex */ NFQA_HWADDR, /* nfqnl_msg_packet_hw */ NFQA_PAYLOAD, /* opaque data payload */ diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 1158428..464c9fa 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -33,6 +33,10 @@ #include +#ifdef CONFIG_BRIDGE_NETFILTER +#include "../bridge/br_private.h" +#endif + #define NFULNL_NLBUFSIZ_DEFAULT 4096 #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ @@ -412,14 +416,64 @@ __build_packet_message(struct nfulnl_instance *inst, if (indev) { tmp_uint = htonl(indev->ifindex); +#ifndef CONFIG_BRIDGE_NETFILTER NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); +#else + if (pf == PF_BRIDGE) { + /* Case 1: outdev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV, + sizeof(tmp_uint), &tmp_uint); + /* this is the bridge group "brX" */ + tmp_uint = htonl(indev->br_port->br->dev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, + sizeof(tmp_uint), &tmp_uint); + } else { + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, + sizeof(tmp_uint), &tmp_uint); + if (skb->nf_bridge && skb->nf_bridge->physindev) { + tmp_uint = + htonl(skb->nf_bridge->physindev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV, + sizeof(tmp_uint), &tmp_uint); + } + } +#endif } if (outdev) { tmp_uint = htonl(outdev->ifindex); +#ifndef CONFIG_BRIDGE_NETFILTER NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); +#else + if (pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + sizeof(tmp_uint), &tmp_uint); + /* this is the bridge group "brX" */ + tmp_uint = htonl(outdev->br_port->br->dev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, + sizeof(tmp_uint), &tmp_uint); + } else { + /* Case 2: indev is a bridge group, we need to look + * for physical device (when called from ipv4) */ + NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, + sizeof(tmp_uint), &tmp_uint); + if (skb->nf_bridge) { + tmp_uint = + htonl(skb->nf_bridge->physoutdev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + sizeof(tmp_uint), &tmp_uint); + } + } +#endif } if (skb->nfmark) { @@ -536,6 +590,10 @@ nfulnl_log_packet(unsigned int pf, + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr)) + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ +#ifdef CONFIG_BRIDGE_NETFILTER + + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ +#endif + NFA_SPACE(sizeof(u_int32_t)) /* mark */ + NFA_SPACE(sizeof(u_int32_t)) /* uid */ + NFA_SPACE(NFULNL_PREFIXLEN) /* prefix */ diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 04323ee..bf92230 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -30,6 +30,10 @@ #include +#ifdef CONFIG_BRIDGE_NETFILTER +#include "../bridge/br_private.h" +#endif + #define NFQNL_QMAX_DEFAULT 1024 #if 0 @@ -361,6 +365,10 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, size = NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hdr)) + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ +#ifdef CONFIG_BRIDGE_NETFILTER + + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ +#endif + NLMSG_SPACE(sizeof(u_int32_t)) /* mark */ + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw)) + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp)); @@ -412,12 +420,62 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (entry->info->indev) { tmp_uint = htonl(entry->info->indev->ifindex); +#ifndef CONFIG_BRIDGE_NETFILTER NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); +#else + if (entry->info->pf == PF_BRIDGE) { + /* Case 1: indev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint), + &tmp_uint); + /* this is the bridge group "brX" */ + tmp_uint = htonl(entry->info->indev->br_port->br->dev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), + &tmp_uint); + } else { + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), + &tmp_uint); + if (entry->skb->nf_bridge + && entry->skb->nf_bridge->physindev) { + tmp_uint = htonl(entry->skb->nf_bridge->physindev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, + sizeof(tmp_uint), &tmp_uint); + } + } +#endif } if (entry->info->outdev) { tmp_uint = htonl(entry->info->outdev->ifindex); +#ifndef CONFIG_BRIDGE_NETFILTER NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); +#else + if (entry->info->pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint), + &tmp_uint); + /* this is the bridge group "brX" */ + tmp_uint = htonl(entry->info->outdev->br_port->br->dev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), + &tmp_uint); + } else { + /* Case 2: outdev is bridge group, we need to look for + * physical output device (when called from ipv4) */ + NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), + &tmp_uint); + if (entry->skb->nf_bridge + && entry->skb->nf_bridge->physoutdev) { + tmp_uint = htonl(entry->skb->nf_bridge->physoutdev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, + sizeof(tmp_uint), &tmp_uint); + } + } +#endif } if (entry->skb->nfmark) { -- cgit v0.10.2 From bbd86b9fc469b7e91dc7444e6abb8930811d79cb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:23:11 -0700 Subject: [NETFILTER]: add /proc/net/netfilter interface to nf_queue This patch adds a /proc/net/netfilter/nf_queue file, similar to the recently-added /proc/net/netfilter/nf_log. It indicates which queue handler is registered to which protocol family. This is useful since there are now multiple queue handlers in the treee (ip[6]_queue, nfnetlink_queue). Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 815583a..bf430fc 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -225,13 +225,16 @@ int nf_getsockopt(struct sock *sk, int pf, int optval, char __user *opt, int *len); /* Packet queuing */ -typedef int (*nf_queue_outfn_t)(struct sk_buff *skb, - struct nf_info *info, - unsigned int queuenum, void *data); +struct nf_queue_handler { + int (*outfn)(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data); + void *data; + char *name; +}; extern int nf_register_queue_handler(int pf, - nf_queue_outfn_t outfn, void *data); + struct nf_queue_handler *qh); extern int nf_unregister_queue_handler(int pf); -extern void nf_unregister_queue_handlers(nf_queue_outfn_t outfn); +extern void nf_unregister_queue_handlers(struct nf_queue_handler *qh); extern void nf_reinject(struct sk_buff *skb, struct nf_info *info, unsigned int verdict); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 629de64..1c49833 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -656,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) } #endif /* CONFIG_PROC_FS */ +static struct nf_queue_handler nfqh = { + .name = "ip_queue", + .outfn = &ipq_enqueue_packet, +}; + static int init_or_cleanup(int init) { @@ -684,7 +689,7 @@ init_or_cleanup(int init) register_netdevice_notifier(&ipq_dev_notifier); ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); - status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL); + status = nf_register_queue_handler(PF_INET, &nfqh); if (status < 0) { printk(KERN_ERR "ip_queue: failed to register queue handler\n"); goto cleanup_sysctl; @@ -692,7 +697,7 @@ init_or_cleanup(int init) return status; cleanup: - nf_unregister_queue_handlers(&ipq_enqueue_packet); + nf_unregister_queue_handlers(&nfqh); synchronize_net(); ipq_flush(NF_DROP); diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 56ffec3..7ecb91e 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -652,6 +652,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) return len; } +static struct nf_queue_handler nfqh = { + .name = "ip6_queue", + .outfn = &ipq_enqueue_packet, +}; + static int init_or_cleanup(int init) { @@ -679,7 +684,7 @@ init_or_cleanup(int init) register_netdevice_notifier(&ipq_dev_notifier); ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); - status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL); + status = nf_register_queue_handler(PF_INET6, &nfqh); if (status < 0) { printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); goto cleanup_sysctl; @@ -687,7 +692,7 @@ init_or_cleanup(int init) return status; cleanup: - nf_unregister_queue_handlers(&ipq_enqueue_packet); + nf_unregister_queue_handlers(&nfqh); synchronize_net(); ipq_flush(NF_DROP); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index ec58c4d..31a9d63 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "nf_internals.h" diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 5586f84..8a67bde 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "nf_internals.h" @@ -14,17 +15,12 @@ * long term mutex. The handler must provide an an outfn() to accept packets * for queueing and must reinject all packets it receives, no matter what. */ -static struct nf_queue_handler_t { - nf_queue_outfn_t outfn; - void *data; -} queue_handler[NPROTO]; - +static struct nf_queue_handler *queue_handler[NPROTO]; static struct nf_queue_rerouter *queue_rerouter; static DEFINE_RWLOCK(queue_handler_lock); - -int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) +int nf_register_queue_handler(int pf, struct nf_queue_handler *qh) { int ret; @@ -32,11 +28,10 @@ int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) return -EINVAL; write_lock_bh(&queue_handler_lock); - if (queue_handler[pf].outfn) + if (queue_handler[pf]) ret = -EBUSY; else { - queue_handler[pf].outfn = outfn; - queue_handler[pf].data = data; + queue_handler[pf] = qh; ret = 0; } write_unlock_bh(&queue_handler_lock); @@ -52,8 +47,7 @@ int nf_unregister_queue_handler(int pf) return -EINVAL; write_lock_bh(&queue_handler_lock); - queue_handler[pf].outfn = NULL; - queue_handler[pf].data = NULL; + queue_handler[pf] = NULL; write_unlock_bh(&queue_handler_lock); return 0; @@ -85,16 +79,14 @@ int nf_unregister_queue_rerouter(int pf) } EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter); -void nf_unregister_queue_handlers(nf_queue_outfn_t outfn) +void nf_unregister_queue_handlers(struct nf_queue_handler *qh) { int pf; write_lock_bh(&queue_handler_lock); for (pf = 0; pf < NPROTO; pf++) { - if (queue_handler[pf].outfn == outfn) { - queue_handler[pf].outfn = NULL; - queue_handler[pf].data = NULL; - } + if (queue_handler[pf] == qh) + queue_handler[pf] = NULL; } write_unlock_bh(&queue_handler_lock); } @@ -121,7 +113,7 @@ int nf_queue(struct sk_buff **skb, /* QUEUE == DROP if noone is waiting, to be safe. */ read_lock(&queue_handler_lock); - if (!queue_handler[pf].outfn) { + if (!queue_handler[pf]->outfn) { read_unlock(&queue_handler_lock); kfree_skb(*skb); return 1; @@ -162,8 +154,8 @@ int nf_queue(struct sk_buff **skb, if (queue_rerouter[pf].save) queue_rerouter[pf].save(*skb, info); - status = queue_handler[pf].outfn(*skb, info, queuenum, - queue_handler[pf].data); + status = queue_handler[pf]->outfn(*skb, info, queuenum, + queue_handler[pf]->data); if (status >= 0 && queue_rerouter[pf].reroute) status = queue_rerouter[pf].reroute(skb, info); @@ -259,13 +251,87 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info, } EXPORT_SYMBOL(nf_reinject); +#ifdef CONFIG_PROC_FS +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void seq_stop(struct seq_file *s, void *v) +{ + +} + +static int seq_show(struct seq_file *s, void *v) +{ + int ret; + loff_t *pos = v; + struct nf_queue_handler *qh; + + read_lock_bh(&queue_handler_lock); + qh = queue_handler[*pos]; + if (!qh) + ret = seq_printf(s, "%2lld NONE\n", *pos); + else + ret = seq_printf(s, "%2lld %s\n", *pos, qh->name); + read_unlock_bh(&queue_handler_lock); + + return ret; +} + +static struct seq_operations nfqueue_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqueue_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nfqueue_seq_ops); +} + +static struct file_operations nfqueue_file_ops = { + .owner = THIS_MODULE, + .open = nfqueue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* PROC_FS */ + + int __init netfilter_queue_init(void) { +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; +#endif queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter), GFP_KERNEL); if (!queue_rerouter) return -ENOMEM; +#ifdef CONFIG_PROC_FS + pde = create_proc_entry("nf_queue", S_IRUGO, proc_net_netfilter); + if (!pde) { + kfree(queue_rerouter); + return -1; + } + pde->proc_fops = &nfqueue_file_ops; +#endif memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter)); return 0; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index bf92230..741686f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -845,6 +845,11 @@ static const int nfqa_cfg_min[NFQA_CFG_MAX] = { [NFQA_CFG_PARAMS-1] = sizeof(struct nfqnl_msg_config_params), }; +static struct nf_queue_handler nfqh = { + .name = "nf_queue", + .outfn = &nfqnl_enqueue_packet, +}; + static int nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) @@ -890,10 +895,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, case NFQNL_CFG_CMD_PF_BIND: QDEBUG("registering queue handler for pf=%u\n", ntohs(cmd->pf)); - ret = nf_register_queue_handler(ntohs(cmd->pf), - nfqnl_enqueue_packet, - NULL); - + ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh); break; case NFQNL_CFG_CMD_PF_UNBIND: QDEBUG("unregistering queue handler for pf=%u\n", @@ -1098,7 +1100,7 @@ init_or_cleanup(int init) return status; cleanup: - nf_unregister_queue_handlers(nfqnl_enqueue_packet); + nf_unregister_queue_handlers(&nfqh); unregister_netdevice_notifier(&nfqnl_dev_notifier); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", proc_net_netfilter); -- cgit v0.10.2 From d72367b6f36e557f122beefaa8c6b80eb1c7f245 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:23:36 -0700 Subject: [NETFILTER]: more verbose return codes from nf_{log,queue} This adds EEXIST to distinguish between the following return values: 0: nobody was registered, registration successful EEXIST: the exact same handler was already registered, no registration required EBUSY: somebody else is registered, registration unsuccessful. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 31a9d63..e104760 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -18,6 +18,8 @@ static struct nf_logger *nf_logging[NPROTO]; /* = NULL */ static DEFINE_SPINLOCK(nf_log_lock); +/* return EBUSY if somebody else is registered, EEXIST if the same logger + * is registred, 0 on success. */ int nf_log_register(int pf, struct nf_logger *logger) { int ret = -EBUSY; @@ -28,7 +30,9 @@ int nf_log_register(int pf, struct nf_logger *logger) if (!nf_logging[pf]) { rcu_assign_pointer(nf_logging[pf], logger); ret = 0; - } + } else if (nf_logging[pf] == logger) + ret = -EEXIST; + spin_unlock(&nf_log_lock); return ret; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 8a67bde..d10d552 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -20,6 +20,8 @@ static struct nf_queue_rerouter *queue_rerouter; static DEFINE_RWLOCK(queue_handler_lock); +/* return EBUSY when somebody else is registered, return EEXIST if the + * same handler is registered, return 0 in case of success. */ int nf_register_queue_handler(int pf, struct nf_queue_handler *qh) { int ret; @@ -28,7 +30,9 @@ int nf_register_queue_handler(int pf, struct nf_queue_handler *qh) return -EINVAL; write_lock_bh(&queue_handler_lock); - if (queue_handler[pf]) + if (queue_handler[pf] == qh) + ret = -EEXIST; + else if (queue_handler[pf]) ret = -EBUSY; else { queue_handler[pf] = qh; -- cgit v0.10.2 From 8a61fadb3908454ccfa538aaa75eb1d22def5700 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:23:53 -0700 Subject: [NETFILTER]: check nf_log function call arguments Check whether pf is too large in order to prevent array overflow. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index bf430fc..ac3c614 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -157,7 +157,7 @@ struct nf_logger { /* Function to register/unregister log function. */ int nf_log_register(int pf, struct nf_logger *logger); -void nf_log_unregister_pf(int pf); +int nf_log_unregister_pf(int pf); void nf_log_unregister_logger(struct nf_logger *logger); /* Calls the registered backend logging function */ diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index e104760..573e76a 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -24,6 +24,9 @@ int nf_log_register(int pf, struct nf_logger *logger) { int ret = -EBUSY; + if (pf >= NPROTO) + return -EINVAL; + /* Any setup of logging members must be done before * substituting pointer. */ spin_lock(&nf_log_lock); @@ -38,14 +41,19 @@ int nf_log_register(int pf, struct nf_logger *logger) } EXPORT_SYMBOL(nf_log_register); -void nf_log_unregister_pf(int pf) +int nf_log_unregister_pf(int pf) { + if (pf >= NPROTO) + return -EINVAL; + spin_lock(&nf_log_lock); nf_logging[pf] = NULL; spin_unlock(&nf_log_lock); /* Give time to concurrent readers. */ synchronize_net(); + + return 0; } EXPORT_SYMBOL(nf_log_unregister_pf); -- cgit v0.10.2 From 7663f18807805f02608457af8e2f59eee5d910fd Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Tue, 9 Aug 2005 20:24:15 -0700 Subject: [NETFILTER]: return ENOMEM when ip_conntrack_alloc() fails. This patch fixes the bug which doesn't return ERR_PTR(-ENOMEM) if it failed to allocate memory space from slab cache. This bug leads to erroneously not dropped packets under stress, and wrong statistic counters ('invalid' is incremented instead of 'drop'). It was introduced during the ctnetlink merge in the net-2.6.14 tree, so no stable or mainline releases affected. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 9261388..285743b 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -655,7 +655,7 @@ struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); - return NULL; + return ERR_PTR(-ENOMEM); } memset(conntrack, 0, sizeof(*conntrack)); @@ -696,8 +696,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple, return NULL; } - if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple))) - return NULL; + conntrack = ip_conntrack_alloc(tuple, &repl_tuple); + if (conntrack == NULL || IS_ERR(conntrack)) + return (struct ip_conntrack_tuple_hash *)conntrack; if (!protocol->new(conntrack, skb)) { ip_conntrack_free(conntrack); -- cgit v0.10.2 From 91b9a277fc4d207249e459a455abf804ebb5499d Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Tue, 9 Aug 2005 20:24:39 -0700 Subject: [IPV4]: FIB Trie cleanups. Below is a patch that cleans up some of this, supposedly without changing any behaviour: * Whitespace cleanups * Introduce DBG() * BUG_ON() instead of if () { BUG(); } * Remove some of the deep nesting to make the code flow more comprehensible * Some mask operations were simplified Signed-off-by: Olof Johansson Signed-off-by: Robert Olsson Signed-off-by: David S. Miller diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 45efd5f..6f818cc 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -89,27 +89,27 @@ typedef unsigned int t_key; #define T_TNODE 0 #define T_LEAF 1 #define NODE_TYPE_MASK 0x1UL -#define NODE_PARENT(_node) \ - ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) -#define NODE_SET_PARENT(_node, _ptr) \ - ((_node)->_parent = (((unsigned long)(_ptr)) | \ - ((_node)->_parent & NODE_TYPE_MASK))) -#define NODE_INIT_PARENT(_node, _type) \ - ((_node)->_parent = (_type)) -#define NODE_TYPE(_node) \ - ((_node)->_parent & NODE_TYPE_MASK) - -#define IS_TNODE(n) (!(n->_parent & T_LEAF)) -#define IS_LEAF(n) (n->_parent & T_LEAF) +#define NODE_PARENT(node) \ + ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK)) +#define NODE_SET_PARENT(node, ptr) \ + ((node)->parent = (((unsigned long)(ptr)) | \ + ((node)->parent & NODE_TYPE_MASK))) +#define NODE_INIT_PARENT(node, type) \ + ((node)->parent = (type)) +#define NODE_TYPE(node) \ + ((node)->parent & NODE_TYPE_MASK) + +#define IS_TNODE(n) (!(n->parent & T_LEAF)) +#define IS_LEAF(n) (n->parent & T_LEAF) struct node { - t_key key; - unsigned long _parent; + t_key key; + unsigned long parent; }; struct leaf { - t_key key; - unsigned long _parent; + t_key key; + unsigned long parent; struct hlist_head list; }; @@ -120,13 +120,13 @@ struct leaf_info { }; struct tnode { - t_key key; - unsigned long _parent; - unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ - unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ - unsigned short full_children; /* KEYLENGTH bits needed */ - unsigned short empty_children; /* KEYLENGTH bits needed */ - struct node *child[0]; + t_key key; + unsigned long parent; + unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ + unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ + unsigned short full_children; /* KEYLENGTH bits needed */ + unsigned short empty_children; /* KEYLENGTH bits needed */ + struct node *child[0]; }; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -150,16 +150,18 @@ struct trie_stat { }; struct trie { - struct node *trie; + struct node *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif - int size; + int size; unsigned int revision; }; static int trie_debug = 0; +#define DBG(x...) do { if (trie_debug) printk(x); } while (0) + static int tnode_full(struct tnode *tn, struct node *n); static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); @@ -171,56 +173,31 @@ static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); extern int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int *dflt); + struct fib_info **last_resort, int *last_idx, int *dflt); extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id, - struct nlmsghdr *n, struct netlink_skb_parms *req); + struct nlmsghdr *n, struct netlink_skb_parms *req); static kmem_cache_t *fn_alias_kmem; static struct trie *trie_local = NULL, *trie_main = NULL; -static void trie_bug(char *err) -{ - printk("Trie Bug: %s\n", err); - BUG(); -} - static inline struct node *tnode_get_child(struct tnode *tn, int i) { - if (i >= 1<bits) - trie_bug("tnode_get_child"); + BUG_ON(i >= 1 << tn->bits); - return tn->child[i]; + return tn->child[i]; } static inline int tnode_child_length(struct tnode *tn) { - return 1<bits; + return 1 << tn->bits; } -/* - _________________________________________________________________ - | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | - ---------------------------------------------------------------- - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - - _________________________________________________________________ - | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | - ----------------------------------------------------------------- - 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - - tp->pos = 7 - tp->bits = 3 - n->pos = 15 - n->bits=4 - KEYLENGTH=32 -*/ - static inline t_key tkey_extract_bits(t_key a, int offset, int bits) { - if (offset < KEYLENGTH) + if (offset < KEYLENGTH) return ((t_key)(a << offset)) >> (KEYLENGTH - bits); - else + else return 0; } @@ -233,8 +210,8 @@ static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) { if (bits == 0 || offset >= KEYLENGTH) return 1; - bits = bits > KEYLENGTH ? KEYLENGTH : bits; - return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; + bits = bits > KEYLENGTH ? KEYLENGTH : bits; + return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; } static inline int tkey_mismatch(t_key a, int offset, t_key b) @@ -249,7 +226,7 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b) return i; } -/* Candiate for fib_semantics */ +/* Candidate for fib_semantics */ static void fn_free_alias(struct fib_alias *fa) { @@ -295,7 +272,7 @@ static void fn_free_alias(struct fib_alias *fa) tp->pos = 7 tp->bits = 3 n->pos = 15 - n->bits=4 + n->bits = 4 First, let's just ignore the bits that come before the parent tp, that is the bits from 0 to (tp->pos-1). They are *known* but at this point we do @@ -343,10 +320,13 @@ static struct leaf *leaf_new(void) static struct leaf_info *leaf_info_new(int plen) { struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); - if (li) { - li->plen = plen; - INIT_LIST_HEAD(&li->falh); - } + + if (!li) + return NULL; + + li->plen = plen; + INIT_LIST_HEAD(&li->falh); + return li; } @@ -373,7 +353,7 @@ static struct tnode *tnode_alloc(unsigned int size) static void __tnode_free(struct tnode *tn) { unsigned int size = sizeof(struct tnode) + - (1<bits) * sizeof(struct node *); + (1 << tn->bits) * sizeof(struct node *); if (size <= PAGE_SIZE) kfree(tn); @@ -387,7 +367,7 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); struct tnode *tn = tnode_alloc(sz); - if (tn) { + if (tn) { memset(tn, 0, sz); NODE_INIT_PARENT(tn, T_TNODE); tn->pos = pos; @@ -397,29 +377,21 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) tn->empty_children = 1< 0) - printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), - (unsigned int) (sizeof(struct node) * 1< 0 ) - printk("FL %p \n", tn); - } - else if (IS_TNODE(tn)) { + DBG("FL %p \n", tn); + } else { __tnode_free(tn); - if (trie_debug > 0 ) - printk("FT %p \n", tn); - } - else { - trie_bug("tnode_free\n"); + DBG("FT %p \n", tn); } } @@ -453,7 +425,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w if (i >= 1<bits) { printk("bits=%d, i=%d\n", tn->bits, i); - trie_bug("tnode_put_child_reorg bits"); + BUG(); } write_lock_bh(&fib_lock); chi = tn->child[i]; @@ -465,15 +437,15 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w tn->empty_children--; /* update fullChildren */ - if (wasfull == -1) + if (wasfull == -1) wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); if (wasfull && !isfull) tn->full_children--; - else if (!wasfull && isfull) tn->full_children++; + if (n) NODE_SET_PARENT(n, tn); @@ -489,9 +461,8 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (!tn) return NULL; - if (trie_debug) - printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", - tn, inflate_threshold, halve_threshold); + DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n", + tn, inflate_threshold, halve_threshold); /* No children */ if (tn->empty_children == tnode_child_length(tn)) { @@ -501,20 +472,21 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* One child */ if (tn->empty_children == tnode_child_length(tn) - 1) for (i = 0; i < tnode_child_length(tn); i++) { + struct node *n; write_lock_bh(&fib_lock); - if (tn->child[i] != NULL) { - - /* compress one level */ - struct node *n = tn->child[i]; - if (n) - NODE_INIT_PARENT(n, NODE_TYPE(n)); - + n = tn->child[i]; + if (!n) { write_unlock_bh(&fib_lock); - tnode_free(tn); - return n; + continue; } + + /* compress one level */ + NODE_INIT_PARENT(n, NODE_TYPE(n)); + write_unlock_bh(&fib_lock); + tnode_free(tn); + return n; } /* * Double as long as the resulting node has a number of @@ -566,16 +538,16 @@ static struct node *resize(struct trie *t, struct tnode *tn) * * expand not_to_be_doubled and to_be_doubled, and shorten: * 100 * (tnode_child_length(tn) - tn->empty_children + - * tn->full_children ) >= inflate_threshold * new_child_length + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: * 100 * (tnode_child_length(tn) - tn->empty_children + - * tn->full_children ) >= + * tn->full_children) >= * inflate_threshold * tnode_child_length(tn) * 2 * * shorten again: * 50 * (tn->full_children + tnode_child_length(tn) - - * tn->empty_children ) >= inflate_threshold * + * tn->empty_children) >= inflate_threshold * * tnode_child_length(tn) * */ @@ -624,20 +596,23 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (tn->empty_children == tnode_child_length(tn) - 1) for (i = 0; i < tnode_child_length(tn); i++) { - - write_lock_bh(&fib_lock); - if (tn->child[i] != NULL) { - /* compress one level */ - struct node *n = tn->child[i]; + struct node *n; - if (n) - NODE_INIT_PARENT(n, NODE_TYPE(n)); + write_lock_bh(&fib_lock); + n = tn->child[i]; + if (!n) { write_unlock_bh(&fib_lock); - tnode_free(tn); - return n; + continue; } + + /* compress one level */ + + NODE_INIT_PARENT(n, NODE_TYPE(n)); + write_unlock_bh(&fib_lock); + tnode_free(tn); + return n; } return (struct node *) tn; @@ -650,8 +625,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) int olen = tnode_child_length(tn); int i; - if (trie_debug) - printk("In inflate\n"); + DBG("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); @@ -666,8 +640,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) * fails. In case of failure we return the oldnode and inflate * of tnode is ignored. */ - - for(i = 0; i < olen; i++) { + + for (i = 0; i < olen; i++) { struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); if (inode && @@ -675,7 +649,6 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) inode->pos == oldtnode->pos + oldtnode->bits && inode->bits > 1) { struct tnode *left, *right; - t_key m = TKEY_GET_MASK(inode->pos, 1); left = tnode_new(inode->key&(~m), inode->pos + 1, @@ -685,7 +658,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) *err = -ENOMEM; break; } - + right = tnode_new(inode->key|m, inode->pos + 1, inode->bits - 1); @@ -703,18 +676,20 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) int size = tnode_child_length(tn); int j; - for(j = 0; j < size; j++) + for (j = 0; j < size; j++) if (tn->child[j]) tnode_free((struct tnode *)tn->child[j]); tnode_free(tn); - + *err = -ENOMEM; return oldtnode; } - for(i = 0; i < olen; i++) { + for (i = 0; i < olen; i++) { struct node *node = tnode_get_child(oldtnode, i); + struct tnode *left, *right; + int size, j; /* An empty child */ if (node == NULL) @@ -740,56 +715,51 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) put_child(t, tn, 2*i+1, inode->child[1]); tnode_free(inode); + continue; } - /* An internal node with more than two children */ - else { - struct tnode *left, *right; - int size, j; - - /* We will replace this node 'inode' with two new - * ones, 'left' and 'right', each with half of the - * original children. The two new nodes will have - * a position one bit further down the key and this - * means that the "significant" part of their keys - * (see the discussion near the top of this file) - * will differ by one bit, which will be "0" in - * left's key and "1" in right's key. Since we are - * moving the key position by one step, the bit that - * we are moving away from - the bit at position - * (inode->pos) - is the one that will differ between - * left and right. So... we synthesize that bit in the - * two new keys. - * The mask 'm' below will be a single "one" bit at - * the position (inode->pos) - */ - - /* Use the old key, but set the new significant - * bit to zero. - */ + /* An internal node with more than two children */ + + /* We will replace this node 'inode' with two new + * ones, 'left' and 'right', each with half of the + * original children. The two new nodes will have + * a position one bit further down the key and this + * means that the "significant" part of their keys + * (see the discussion near the top of this file) + * will differ by one bit, which will be "0" in + * left's key and "1" in right's key. Since we are + * moving the key position by one step, the bit that + * we are moving away from - the bit at position + * (inode->pos) - is the one that will differ between + * left and right. So... we synthesize that bit in the + * two new keys. + * The mask 'm' below will be a single "one" bit at + * the position (inode->pos) + */ - left = (struct tnode *) tnode_get_child(tn, 2*i); - put_child(t, tn, 2*i, NULL); + /* Use the old key, but set the new significant + * bit to zero. + */ - if (!left) - BUG(); + left = (struct tnode *) tnode_get_child(tn, 2*i); + put_child(t, tn, 2*i, NULL); - right = (struct tnode *) tnode_get_child(tn, 2*i+1); - put_child(t, tn, 2*i+1, NULL); + BUG_ON(!left); - if (!right) - BUG(); + right = (struct tnode *) tnode_get_child(tn, 2*i+1); + put_child(t, tn, 2*i+1, NULL); - size = tnode_child_length(left); - for(j = 0; j < size; j++) { - put_child(t, left, j, inode->child[j]); - put_child(t, right, j, inode->child[j + size]); - } - put_child(t, tn, 2*i, resize(t, left)); - put_child(t, tn, 2*i+1, resize(t, right)); + BUG_ON(!right); - tnode_free(inode); + size = tnode_child_length(left); + for (j = 0; j < size; j++) { + put_child(t, left, j, inode->child[j]); + put_child(t, right, j, inode->child[j + size]); } + put_child(t, tn, 2*i, resize(t, left)); + put_child(t, tn, 2*i+1, resize(t, right)); + + tnode_free(inode); } tnode_free(oldtnode); return tn; @@ -802,7 +772,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) int i; int olen = tnode_child_length(tn); - if (trie_debug) printk("In halve\n"); + DBG("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); @@ -818,7 +788,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) * of tnode is ignored. */ - for(i = 0; i < olen; i += 2) { + for (i = 0; i < olen; i += 2) { left = tnode_get_child(oldtnode, i); right = tnode_get_child(oldtnode, i+1); @@ -839,17 +809,19 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) int size = tnode_child_length(tn); int j; - for(j = 0; j < size; j++) + for (j = 0; j < size; j++) if (tn->child[j]) tnode_free((struct tnode *)tn->child[j]); tnode_free(tn); - + *err = -ENOMEM; return oldtnode; } - for(i = 0; i < olen; i += 2) { + for (i = 0; i < olen; i += 2) { + struct tnode *newBinNode; + left = tnode_get_child(oldtnode, i); right = tnode_get_child(oldtnode, i+1); @@ -858,38 +830,39 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) if (right == NULL) /* Both are empty */ continue; put_child(t, tn, i/2, right); - } else if (right == NULL) + continue; + } + + if (right == NULL) { put_child(t, tn, i/2, left); + continue; + } /* Two nonempty children */ - else { - struct tnode *newBinNode = - (struct tnode *) tnode_get_child(tn, i/2); - put_child(t, tn, i/2, NULL); + newBinNode = (struct tnode *) tnode_get_child(tn, i/2); + put_child(t, tn, i/2, NULL); - if (!newBinNode) - BUG(); + BUG_ON(!newBinNode); - put_child(t, newBinNode, 0, left); - put_child(t, newBinNode, 1, right); - put_child(t, tn, i/2, resize(t, newBinNode)); - } + put_child(t, newBinNode, 0, left); + put_child(t, newBinNode, 1, right); + put_child(t, tn, i/2, resize(t, newBinNode)); } tnode_free(oldtnode); return tn; } -static void *trie_init(struct trie *t) +static void trie_init(struct trie *t) { - if (t) { - t->size = 0; - t->trie = NULL; - t->revision = 0; + if (!t) + return; + + t->size = 0; + t->trie = NULL; + t->revision = 0; #ifdef CONFIG_IP_FIB_TRIE_STATS - memset(&t->stats, 0, sizeof(struct trie_use_stats)); + memset(&t->stats, 0, sizeof(struct trie_use_stats)); #endif - } - return t; } static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) @@ -897,39 +870,37 @@ static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) struct hlist_node *node; struct leaf_info *li; - hlist_for_each_entry(li, node, head, hlist) { + hlist_for_each_entry(li, node, head, hlist) if (li->plen == plen) return li; - } + return NULL; } static inline struct list_head * get_fa_head(struct leaf *l, int plen) { - struct list_head *fa_head = NULL; struct leaf_info *li = find_leaf_info(&l->list, plen); - if (li) - fa_head = &li->falh; + if (!li) + return NULL; - return fa_head; + return &li->falh; } static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) { struct leaf_info *li = NULL, *last = NULL; - struct hlist_node *node, *tmp; + struct hlist_node *node; write_lock_bh(&fib_lock); - if (hlist_empty(head)) + if (hlist_empty(head)) { hlist_add_head(&new->hlist, head); - else { - hlist_for_each_entry_safe(li, node, tmp, head, hlist) { - + } else { + hlist_for_each_entry(li, node, head, hlist) { if (new->plen > li->plen) break; - + last = li; } if (last) @@ -952,49 +923,47 @@ fib_find_node(struct trie *t, u32 key) while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; - + check_tnode(tn); - + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { - pos=tn->pos + tn->bits; + pos = tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); - } - else + } else break; } /* Case we have found a leaf. Compare prefixes */ - if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { - struct leaf *l = (struct leaf *) n; - return l; - } + if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) + return (struct leaf *)n; + return NULL; } static struct node *trie_rebalance(struct trie *t, struct tnode *tn) { - int i = 0; + int i; int wasfull; t_key cindex, key; struct tnode *tp = NULL; - if (!tn) - BUG(); + BUG_ON(!tn); key = tn->key; i = 0; while (tn != NULL && NODE_PARENT(tn) != NULL) { - if (i > 10) { printk("Rebalance tn=%p \n", tn); - if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); - + if (tn) + printk("tn->parent=%p \n", NODE_PARENT(tn)); + printk("Rebalance tp=%p \n", tp); - if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); + if (tp) + printk("tp->parent=%p \n", NODE_PARENT(tp)); } - if (i > 12) BUG(); + BUG_ON(i > 12); /* Why is this a bug? -ojn */ i++; tp = NODE_PARENT(tn); @@ -1002,7 +971,7 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *) resize (t, (struct tnode *)tn); tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); - + if (!NODE_PARENT(tn)) break; @@ -1050,20 +1019,19 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; - + check_tnode(tn); - + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { tp = tn; - pos=tn->pos + tn->bits; + pos = tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); if (n && NODE_PARENT(n) != tn) { printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); BUG(); } - } - else + } else break; } @@ -1073,17 +1041,15 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) * tp is n's (parent) ----> NULL or TNODE */ - if (tp && IS_LEAF(tp)) - BUG(); - + BUG_ON(tp && IS_LEAF(tp)); /* Case 1: n is a leaf. Compare prefixes */ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { - struct leaf *l = ( struct leaf *) n; - + struct leaf *l = (struct leaf *) n; + li = leaf_info_new(plen); - + if (!li) { *err = -ENOMEM; goto err; @@ -1113,35 +1079,31 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) fa_head = &li->falh; insert_leaf_info(&l->list, li); - /* Case 2: n is NULL, and will just insert a new leaf */ if (t->trie && n == NULL) { + /* Case 2: n is NULL, and will just insert a new leaf */ NODE_SET_PARENT(l, tp); - - if (!tp) - BUG(); - else { - cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct node *)l); - } - } - /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ - else { + BUG_ON(!tp); + + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, (struct node *)l); + } else { + /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* * Add a new tnode here * first tnode need some special handling */ if (tp) - pos=tp->pos+tp->bits; + pos = tp->pos+tp->bits; else - pos=0; + pos = 0; + if (n) { newpos = tkey_mismatch(key, pos, n->key); tn = tnode_new(n->key, newpos, 1); - } - else { + } else { newpos = 0; tn = tnode_new(key, newpos, 1); /* First tnode */ } @@ -1151,32 +1113,32 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) tnode_free((struct tnode *) l); *err = -ENOMEM; goto err; - } - + } + NODE_SET_PARENT(tn, tp); - missbit=tkey_extract_bits(key, newpos, 1); + missbit = tkey_extract_bits(key, newpos, 1); put_child(t, tn, missbit, (struct node *)l); put_child(t, tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); - } - else { + } else { t->trie = (struct node*) tn; /* First tnode */ tp = tn; } } - if (tp && tp->pos+tp->bits > 32) { + + if (tp && tp->pos + tp->bits > 32) printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", tp, tp->pos, tp->bits, key, plen); - } + /* Rebalance the trie */ t->trie = trie_rebalance(t, tp); done: t->revision++; -err:; +err: return fa_head; } @@ -1204,17 +1166,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, key = ntohl(key); - if (trie_debug) - printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); + DBG("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); - mask = ntohl( inet_make_mask(plen) ); + mask = ntohl(inet_make_mask(plen)); if (key & ~mask) return -EINVAL; key = key & mask; - if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL) + fi = fib_create_info(r, rta, nlhdr, &err); + + if (!fi) goto err; l = fib_find_node(t, key); @@ -1236,8 +1199,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, * and we need to allocate a new one of those as well. */ - if (fa && - fa->fa_info->fib_priority == fi->fib_priority) { + if (fa && fa->fa_info->fib_priority == fi->fib_priority) { struct fib_alias *fa_orig; err = -EEXIST; @@ -1261,9 +1223,9 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(-1); - goto succeeded; + goto succeeded; } /* Error if we find a perfect match which * uses the same scope, type, and nexthop @@ -1285,7 +1247,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, fa = fa_orig; } err = -ENOENT; - if (!(nlhdr->nlmsg_flags&NLM_F_CREATE)) + if (!(nlhdr->nlmsg_flags & NLM_F_CREATE)) goto out; err = -ENOBUFS; @@ -1298,9 +1260,6 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, new_fa->fa_type = type; new_fa->fa_scope = r->rtm_scope; new_fa->fa_state = 0; -#if 0 - new_fa->dst = NULL; -#endif /* * Insert new entry to the list. */ @@ -1314,8 +1273,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, write_lock_bh(&fib_lock); - list_add_tail(&new_fa->fa_list, - (fa ? &fa->fa_list : fa_head)); + list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head)); write_unlock_bh(&fib_lock); @@ -1328,7 +1286,7 @@ out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); -err:; +err: return err; } @@ -1342,7 +1300,6 @@ static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *pl struct hlist_node *node; hlist_for_each_entry(li, node, hhead, hlist) { - i = li->plen; mask = ntohl(inet_make_mask(i)); if (l->key != (key & mask)) @@ -1370,13 +1327,18 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result struct node *n; struct tnode *pn; int pos, bits; - t_key key=ntohl(flp->fl4_dst); + t_key key = ntohl(flp->fl4_dst); int chopped_off; t_key cindex = 0; int current_prefix_length = KEYLENGTH; + struct tnode *cn; + t_key node_prefix, key_prefix, pref_mismatch; + int mp; + n = t->trie; read_lock(&fib_lock); + if (!n) goto failed; @@ -1393,8 +1355,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result pn = (struct tnode *) n; chopped_off = 0; - while (pn) { - + while (pn) { pos = pn->pos; bits = pn->bits; @@ -1410,130 +1371,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result goto backtrace; } - if (IS_TNODE(n)) { + if (IS_LEAF(n)) { + if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) + goto found; + else + goto backtrace; + } + #define HL_OPTIMIZE #ifdef HL_OPTIMIZE - struct tnode *cn = (struct tnode *)n; - t_key node_prefix, key_prefix, pref_mismatch; - int mp; + cn = (struct tnode *)n; - /* - * It's a tnode, and we can do some extra checks here if we - * like, to avoid descending into a dead-end branch. - * This tnode is in the parent's child array at index - * key[p_pos..p_pos+p_bits] but potentially with some bits - * chopped off, so in reality the index may be just a - * subprefix, padded with zero at the end. - * We can also take a look at any skipped bits in this - * tnode - everything up to p_pos is supposed to be ok, - * and the non-chopped bits of the index (se previous - * paragraph) are also guaranteed ok, but the rest is - * considered unknown. - * - * The skipped bits are key[pos+bits..cn->pos]. - */ - - /* If current_prefix_length < pos+bits, we are already doing - * actual prefix matching, which means everything from - * pos+(bits-chopped_off) onward must be zero along some - * branch of this subtree - otherwise there is *no* valid - * prefix present. Here we can only check the skipped - * bits. Remember, since we have already indexed into the - * parent's child array, we know that the bits we chopped of - * *are* zero. - */ + /* + * It's a tnode, and we can do some extra checks here if we + * like, to avoid descending into a dead-end branch. + * This tnode is in the parent's child array at index + * key[p_pos..p_pos+p_bits] but potentially with some bits + * chopped off, so in reality the index may be just a + * subprefix, padded with zero at the end. + * We can also take a look at any skipped bits in this + * tnode - everything up to p_pos is supposed to be ok, + * and the non-chopped bits of the index (se previous + * paragraph) are also guaranteed ok, but the rest is + * considered unknown. + * + * The skipped bits are key[pos+bits..cn->pos]. + */ - /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ - - if (current_prefix_length < pos+bits) { - if (tkey_extract_bits(cn->key, current_prefix_length, - cn->pos - current_prefix_length) != 0 || - !(cn->child[0])) - goto backtrace; - } + /* If current_prefix_length < pos+bits, we are already doing + * actual prefix matching, which means everything from + * pos+(bits-chopped_off) onward must be zero along some + * branch of this subtree - otherwise there is *no* valid + * prefix present. Here we can only check the skipped + * bits. Remember, since we have already indexed into the + * parent's child array, we know that the bits we chopped of + * *are* zero. + */ - /* - * If chopped_off=0, the index is fully validated and we - * only need to look at the skipped bits for this, the new, - * tnode. What we actually want to do is to find out if - * these skipped bits match our key perfectly, or if we will - * have to count on finding a matching prefix further down, - * because if we do, we would like to have some way of - * verifying the existence of such a prefix at this point. - */ + /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ - /* The only thing we can do at this point is to verify that - * any such matching prefix can indeed be a prefix to our - * key, and if the bits in the node we are inspecting that - * do not match our key are not ZERO, this cannot be true. - * Thus, find out where there is a mismatch (before cn->pos) - * and verify that all the mismatching bits are zero in the - * new tnode's key. - */ + if (current_prefix_length < pos+bits) { + if (tkey_extract_bits(cn->key, current_prefix_length, + cn->pos - current_prefix_length) != 0 || + !(cn->child[0])) + goto backtrace; + } - /* Note: We aren't very concerned about the piece of the key - * that precede pn->pos+pn->bits, since these have already been - * checked. The bits after cn->pos aren't checked since these are - * by definition "unknown" at this point. Thus, what we want to - * see is if we are about to enter the "prefix matching" state, - * and in that case verify that the skipped bits that will prevail - * throughout this subtree are zero, as they have to be if we are - * to find a matching prefix. - */ + /* + * If chopped_off=0, the index is fully validated and we + * only need to look at the skipped bits for this, the new, + * tnode. What we actually want to do is to find out if + * these skipped bits match our key perfectly, or if we will + * have to count on finding a matching prefix further down, + * because if we do, we would like to have some way of + * verifying the existence of such a prefix at this point. + */ - node_prefix = MASK_PFX(cn->key, cn->pos); - key_prefix = MASK_PFX(key, cn->pos); - pref_mismatch = key_prefix^node_prefix; - mp = 0; + /* The only thing we can do at this point is to verify that + * any such matching prefix can indeed be a prefix to our + * key, and if the bits in the node we are inspecting that + * do not match our key are not ZERO, this cannot be true. + * Thus, find out where there is a mismatch (before cn->pos) + * and verify that all the mismatching bits are zero in the + * new tnode's key. + */ - /* In short: If skipped bits in this node do not match the search - * key, enter the "prefix matching" state.directly. - */ - if (pref_mismatch) { - while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { - mp++; - pref_mismatch = pref_mismatch <<1; - } - key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); - - if (key_prefix != 0) - goto backtrace; - - if (current_prefix_length >= cn->pos) - current_prefix_length=mp; - } -#endif - pn = (struct tnode *)n; /* Descend */ - chopped_off = 0; - continue; + /* Note: We aren't very concerned about the piece of the key + * that precede pn->pos+pn->bits, since these have already been + * checked. The bits after cn->pos aren't checked since these are + * by definition "unknown" at this point. Thus, what we want to + * see is if we are about to enter the "prefix matching" state, + * and in that case verify that the skipped bits that will prevail + * throughout this subtree are zero, as they have to be if we are + * to find a matching prefix. + */ + + node_prefix = MASK_PFX(cn->key, cn->pos); + key_prefix = MASK_PFX(key, cn->pos); + pref_mismatch = key_prefix^node_prefix; + mp = 0; + + /* In short: If skipped bits in this node do not match the search + * key, enter the "prefix matching" state.directly. + */ + if (pref_mismatch) { + while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { + mp++; + pref_mismatch = pref_mismatch <<1; + } + key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); + + if (key_prefix != 0) + goto backtrace; + + if (current_prefix_length >= cn->pos) + current_prefix_length = mp; } - if (IS_LEAF(n)) { - if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) - goto found; - } +#endif + pn = (struct tnode *)n; /* Descend */ + chopped_off = 0; + continue; + backtrace: chopped_off++; /* As zero don't change the child key (cindex) */ - while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) { + while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) chopped_off++; - } /* Decrease current_... with bits chopped off */ if (current_prefix_length > pn->pos + pn->bits - chopped_off) current_prefix_length = pn->pos + pn->bits - chopped_off; - + /* * Either we do the actual chop off according or if we have * chopped off all bits in this tnode walk up to our parent. */ - if (chopped_off <= pn->bits) + if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); - else { + } else { if (NODE_PARENT(pn) == NULL) goto failed; - + /* Get Child's index */ cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); pn = NODE_PARENT(pn); @@ -1559,24 +1519,23 @@ static int trie_leaf_remove(struct trie *t, t_key key) struct node *n = t->trie; struct leaf *l; - if (trie_debug) - printk("entering trie_leaf_remove(%p)\n", n); + DBG("entering trie_leaf_remove(%p)\n", n); /* Note that in the case skipped bits, those bits are *not* checked! * When we finish this, we will have NULL or a T_LEAF, and the * T_LEAF may or may not match our key. */ - while (n != NULL && IS_TNODE(n)) { + while (n != NULL && IS_TNODE(n)) { struct tnode *tn = (struct tnode *) n; check_tnode(tn); n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); - if (n && NODE_PARENT(n) != tn) { - printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); - BUG(); - } - } + if (n && NODE_PARENT(n) != tn) { + printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); + BUG(); + } + } l = (struct leaf *) n; if (!n || !tkey_equals(l->key, key)) @@ -1597,8 +1556,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, NULL); t->trie = trie_rebalance(t, tp); - } - else + } else t->trie = NULL; return 1; @@ -1606,7 +1564,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) static int fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, - struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) + struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) { struct trie *t = (struct trie *) tb->tb_data; u32 key, mask; @@ -1615,6 +1573,9 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct fib_alias *fa, *fa_to_delete; struct list_head *fa_head; struct leaf *l; + int kill_li = 0; + struct leaf_info *li; + if (plen > 32) return -EINVAL; @@ -1624,7 +1585,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, memcpy(&key, rta->rta_dst, 4); key = ntohl(key); - mask = ntohl( inet_make_mask(plen) ); + mask = ntohl(inet_make_mask(plen)); if (key & ~mask) return -EINVAL; @@ -1641,8 +1602,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, if (!fa) return -ESRCH; - if (trie_debug) - printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); + DBG("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); fa_to_delete = NULL; fa_head = fa->fa_list.prev; @@ -1664,39 +1624,36 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, } } - if (fa_to_delete) { - int kill_li = 0; - struct leaf_info *li; + if (!fa_to_delete) + return -ESRCH; - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); + fa = fa_to_delete; + rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); + + l = fib_find_node(t, key); + li = find_leaf_info(&l->list, plen); - l = fib_find_node(t, key); - li = find_leaf_info(&l->list, plen); + write_lock_bh(&fib_lock); - write_lock_bh(&fib_lock); + list_del(&fa->fa_list); - list_del(&fa->fa_list); + if (list_empty(fa_head)) { + hlist_del(&li->hlist); + kill_li = 1; + } + write_unlock_bh(&fib_lock); - if (list_empty(fa_head)) { - hlist_del(&li->hlist); - kill_li = 1; - } - write_unlock_bh(&fib_lock); - - if (kill_li) - free_leaf_info(li); + if (kill_li) + free_leaf_info(li); - if (hlist_empty(&l->list)) - trie_leaf_remove(t, key); + if (hlist_empty(&l->list)) + trie_leaf_remove(t, key); - if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(-1); + if (fa->fa_state & FA_S_ACCESSED) + rt_cache_flush(-1); - fn_free_alias(fa); - return 0; - } - return -ESRCH; + fn_free_alias(fa); + return 0; } static int trie_flush_list(struct trie *t, struct list_head *head) @@ -1706,9 +1663,8 @@ static int trie_flush_list(struct trie *t, struct list_head *head) list_for_each_entry_safe(fa, fa_node, head, fa_list) { struct fib_info *fi = fa->fa_info; - - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { + if (fi && (fi->fib_flags&RTNH_F_DEAD)) { write_lock_bh(&fib_lock); list_del(&fa->fa_list); write_unlock_bh(&fib_lock); @@ -1728,11 +1684,9 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) struct leaf_info *li = NULL; hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { - found += trie_flush_list(t, &li->falh); if (list_empty(&li->falh)) { - write_lock_bh(&fib_lock); hlist_del(&li->hlist); write_unlock_bh(&fib_lock); @@ -1757,8 +1711,7 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) return (struct leaf *) t->trie; p = (struct tnode*) t->trie; /* Start */ - } - else + } else p = (struct tnode *) NODE_PARENT(c); while (p) { @@ -1771,29 +1724,28 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) pos = 0; last = 1 << p->bits; - for(idx = pos; idx < last ; idx++) { - if (p->child[idx]) { - - /* Decend if tnode */ - - while (IS_TNODE(p->child[idx])) { - p = (struct tnode*) p->child[idx]; - idx = 0; - - /* Rightmost non-NULL branch */ - if (p && IS_TNODE(p)) - while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; - - /* Done with this tnode? */ - if (idx >= (1 << p->bits) || p->child[idx] == NULL ) - goto up; - } - return (struct leaf*) p->child[idx]; + for (idx = pos; idx < last ; idx++) { + if (!p->child[idx]) + continue; + + /* Decend if tnode */ + while (IS_TNODE(p->child[idx])) { + p = (struct tnode*) p->child[idx]; + idx = 0; + + /* Rightmost non-NULL branch */ + if (p && IS_TNODE(p)) + while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; + + /* Done with this tnode? */ + if (idx >= (1 << p->bits) || p->child[idx] == NULL) + goto up; } + return (struct leaf*) p->child[idx]; } up: /* No more children go up one step */ - c = (struct node*) p; + c = (struct node *) p; p = (struct tnode *) NODE_PARENT(p); } return NULL; /* Ready. Root of trie */ @@ -1807,7 +1759,7 @@ static int fn_trie_flush(struct fib_table *tb) t->revision++; - for (h=0; (l = nextleaf(t, l)) != NULL; h++) { + for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { found += trie_flush_leaf(t, l); if (ll && hlist_empty(&ll->list)) @@ -1818,12 +1770,11 @@ static int fn_trie_flush(struct fib_table *tb) if (ll && hlist_empty(&ll->list)) trie_leaf_remove(t, ll->key); - if (trie_debug) - printk("trie_flush found=%d\n", found); + DBG("trie_flush found=%d\n", found); return found; } -static int trie_last_dflt=-1; +static int trie_last_dflt = -1; static void fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) @@ -1855,18 +1806,18 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib list_for_each_entry(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; - + if (fa->fa_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - + if (next_fi->fib_priority > res->fi->fib_priority) break; if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; fa->fa_state |= FA_S_ACCESSED; - + if (fi == NULL) { if (next_fi != res->fi) break; @@ -1913,9 +1864,9 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi int i, s_i; struct fib_alias *fa; - u32 xkey=htonl(key); + u32 xkey = htonl(key); - s_i=cb->args[3]; + s_i = cb->args[3]; i = 0; list_for_each_entry(fa, fah, fa_list) { @@ -1946,10 +1897,10 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi fa->fa_info, 0) < 0) { cb->args[3] = i; return -1; - } + } i++; } - cb->args[3]=i; + cb->args[3] = i; return skb->len; } @@ -1959,10 +1910,10 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str int h, s_h; struct list_head *fa_head; struct leaf *l = NULL; - s_h=cb->args[2]; - for (h=0; (l = nextleaf(t, l)) != NULL; h++) { + s_h = cb->args[2]; + for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { if (h < s_h) continue; if (h > s_h) @@ -1970,7 +1921,7 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str sizeof(cb->args) - 3*sizeof(cb->args[0])); fa_head = get_fa_head(l, plen); - + if (!fa_head) continue; @@ -1978,11 +1929,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str continue; if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { - cb->args[2]=h; + cb->args[2] = h; return -1; } } - cb->args[2]=h; + cb->args[2] = h; return skb->len; } @@ -1994,13 +1945,12 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin s_m = cb->args[1]; read_lock(&fib_lock); - for (m=0; m<=32; m++) { - + for (m = 0; m <= 32; m++) { if (m < s_m) continue; if (m > s_m) memset(&cb->args[2], 0, - sizeof(cb->args) - 2*sizeof(cb->args[0])); + sizeof(cb->args) - 2*sizeof(cb->args[0])); if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { cb->args[1] = m; @@ -2010,7 +1960,7 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin read_unlock(&fib_lock); cb->args[1] = m; return skb->len; - out: +out: read_unlock(&fib_lock); return -1; } @@ -2051,9 +2001,9 @@ struct fib_table * __init fib_hash_init(int id) trie_init(t); if (id == RT_TABLE_LOCAL) - trie_local = t; + trie_local = t; else if (id == RT_TABLE_MAIN) - trie_main = t; + trie_main = t; if (id == RT_TABLE_LOCAL) printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); @@ -2065,7 +2015,8 @@ struct fib_table * __init fib_hash_init(int id) static void putspace_seq(struct seq_file *seq, int n) { - while (n--) seq_printf(seq, " "); + while (n--) + seq_printf(seq, " "); } static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) @@ -2086,29 +2037,22 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, seq_printf(seq, "%d/", cindex); printbin_seq(seq, cindex, bits); seq_printf(seq, ": "); - } - else + } else seq_printf(seq, ": "); seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); - if (IS_LEAF(n)) - seq_printf(seq, "key=%d.%d.%d.%d\n", - n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); - else { - int plen = ((struct tnode *)n)->pos; - t_key prf=MASK_PFX(n->key, plen); - seq_printf(seq, "key=%d.%d.%d.%d/%d\n", - prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); - } if (IS_LEAF(n)) { - struct leaf *l=(struct leaf *)n; + struct leaf *l = (struct leaf *)n; struct fib_alias *fa; int i; - for (i=32; i>=0; i--) - if (find_leaf_info(&l->list, i)) { - + + seq_printf(seq, "key=%d.%d.%d.%d\n", + n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); + + for (i = 32; i >= 0; i--) + if (find_leaf_info(&l->list, i)) { struct list_head *fa_head = get_fa_head(l, i); - + if (!fa_head) continue; @@ -2118,17 +2062,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, putspace_seq(seq, indent+2); seq_printf(seq, "{/%d...dumping}\n", i); - list_for_each_entry(fa, fa_head, fa_list) { putspace_seq(seq, indent+2); - if (fa->fa_info->fib_nh == NULL) { - seq_printf(seq, "Error _fib_nh=NULL\n"); - continue; - } if (fa->fa_info == NULL) { seq_printf(seq, "Error fa_info=NULL\n"); continue; } + if (fa->fa_info->fib_nh == NULL) { + seq_printf(seq, "Error _fib_nh=NULL\n"); + continue; + } seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", fa->fa_type, @@ -2136,11 +2079,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, fa->fa_tos); } } - } - else if (IS_TNODE(n)) { + } else { struct tnode *tn = (struct tnode *)n; + int plen = ((struct tnode *)n)->pos; + t_key prf = MASK_PFX(n->key, plen); + + seq_printf(seq, "key=%d.%d.%d.%d/%d\n", + prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); + putspace_seq(seq, indent); seq_printf(seq, "| "); - seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); + seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos)); printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); seq_printf(seq, "}\n"); putspace_seq(seq, indent); seq_printf(seq, "| "); @@ -2155,100 +2103,103 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, static void trie_dump_seq(struct seq_file *seq, struct trie *t) { struct node *n = t->trie; - int cindex=0; - int indent=1; - int pend=0; + int cindex = 0; + int indent = 1; + int pend = 0; int depth = 0; + struct tnode *tn; read_lock(&fib_lock); seq_printf(seq, "------ trie_dump of t=%p ------\n", t); - if (n) { - printnode_seq(seq, indent, n, pend, cindex, 0); - if (IS_TNODE(n)) { - struct tnode *tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); - indent += 3; - depth++; - - while (tn && cindex < (1 << tn->bits)) { - if (tn->child[cindex]) { - - /* Got a child */ - - printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); - if (IS_LEAF(tn->child[cindex])) { - cindex++; - - } - else { - /* - * New tnode. Decend one level - */ - - depth++; - n = tn->child[cindex]; - tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); - indent+=3; - cindex=0; - } - } - else - cindex++; + if (!n) { + seq_printf(seq, "------ trie is empty\n"); + + read_unlock(&fib_lock); + return; + } + + printnode_seq(seq, indent, n, pend, cindex, 0); + + if (!IS_TNODE(n)) { + read_unlock(&fib_lock); + return; + } + + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); + indent += 3; + depth++; + + while (tn && cindex < (1 << tn->bits)) { + if (tn->child[cindex]) { + /* Got a child */ + + printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); + if (IS_LEAF(tn->child[cindex])) { + cindex++; + } else { /* - * Test if we are done + * New tnode. Decend one level */ - - while (cindex >= (1 << tn->bits)) { - /* - * Move upwards and test for root - * pop off all traversed nodes - */ - - if (NODE_PARENT(tn) == NULL) { - tn = NULL; - n = NULL; - break; - } - else { - cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); - tn = NODE_PARENT(tn); - cindex++; - n = (struct node *)tn; - pend = tn->pos+tn->bits; - indent-=3; - depth--; - } - } + depth++; + tn = (struct tnode *)tn->child[cindex]; + pend = tn->pos + tn->bits; + putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); + indent += 3; + cindex = 0; } + } else + cindex++; + + /* + * Test if we are done + */ + + while (cindex >= (1 << tn->bits)) { + /* + * Move upwards and test for root + * pop off all traversed nodes + */ + + if (NODE_PARENT(tn) == NULL) { + tn = NULL; + break; + } + + cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); + cindex++; + tn = NODE_PARENT(tn); + pend = tn->pos + tn->bits; + indent -= 3; + depth--; } - else n = NULL; } - else seq_printf(seq, "------ trie is empty\n"); read_unlock(&fib_lock); } static struct trie_stat *trie_stat_new(void) { - struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); + struct trie_stat *s; int i; - if (s) { - s->totdepth = 0; - s->maxdepth = 0; - s->tnodes = 0; - s->leaves = 0; - s->nullpointers = 0; - - for(i=0; i< MAX_CHILDS; i++) - s->nodesizes[i] = 0; - } + s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); + if (!s) + return NULL; + + s->totdepth = 0; + s->maxdepth = 0; + s->tnodes = 0; + s->leaves = 0; + s->nullpointers = 0; + + for (i = 0; i < MAX_CHILDS; i++) + s->nodesizes[i] = 0; + return s; } @@ -2257,91 +2208,81 @@ static struct trie_stat *trie_collect_stats(struct trie *t) struct node *n = t->trie; struct trie_stat *s = trie_stat_new(); int cindex = 0; - int indent = 1; int pend = 0; int depth = 0; - read_lock(&fib_lock); + if (!s) + return NULL; + if (!n) + return s; - if (s) { - if (n) { - if (IS_TNODE(n)) { - struct tnode *tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - indent += 3; - s->nodesizes[tn->bits]++; - depth++; + read_lock(&fib_lock); - while (tn && cindex < (1 << tn->bits)) { - if (tn->child[cindex]) { - /* Got a child */ - - if (IS_LEAF(tn->child[cindex])) { - cindex++; - - /* stats */ - if (depth > s->maxdepth) - s->maxdepth = depth; - s->totdepth += depth; - s->leaves++; - } - - else { - /* - * New tnode. Decend one level - */ - - s->tnodes++; - s->nodesizes[tn->bits]++; - depth++; - - n = tn->child[cindex]; - tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - - indent += 3; - cindex = 0; - } - } - else { - cindex++; - s->nullpointers++; - } + if (IS_TNODE(n)) { + struct tnode *tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + s->nodesizes[tn->bits]++; + depth++; + + while (tn && cindex < (1 << tn->bits)) { + if (tn->child[cindex]) { + /* Got a child */ + if (IS_LEAF(tn->child[cindex])) { + cindex++; + + /* stats */ + if (depth > s->maxdepth) + s->maxdepth = depth; + s->totdepth += depth; + s->leaves++; + } else { /* - * Test if we are done + * New tnode. Decend one level */ - - while (cindex >= (1 << tn->bits)) { - - /* - * Move upwards and test for root - * pop off all traversed nodes - */ - - - if (NODE_PARENT(tn) == NULL) { - tn = NULL; - n = NULL; - break; - } - else { - cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); - tn = NODE_PARENT(tn); - cindex++; - n = (struct node *)tn; - pend = tn->pos+tn->bits; - indent -= 3; - depth--; - } - } + + s->tnodes++; + s->nodesizes[tn->bits]++; + depth++; + + n = tn->child[cindex]; + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + + cindex = 0; } + } else { + cindex++; + s->nullpointers++; } - else n = NULL; + + /* + * Test if we are done + */ + + while (cindex >= (1 << tn->bits)) { + /* + * Move upwards and test for root + * pop off all traversed nodes + */ + + if (NODE_PARENT(tn) == NULL) { + tn = NULL; + n = NULL; + break; + } + + cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); + tn = NODE_PARENT(tn); + cindex++; + n = (struct node *)tn; + pend = tn->pos+tn->bits; + depth--; + } } } - read_unlock(&fib_lock); + read_unlock(&fib_lock); return s; } @@ -2359,17 +2300,22 @@ static struct fib_alias *fib_triestat_get_next(struct seq_file *seq) static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) { - void *v = NULL; + if (!ip_fib_main_table) + return NULL; - if (ip_fib_main_table) - v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN; - return v; + if (*pos) + return fib_triestat_get_next(seq); + else + return SEQ_START_TOKEN; } static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq); + if (v == SEQ_START_TOKEN) + return fib_triestat_get_first(seq); + else + return fib_triestat_get_next(seq); } static void fib_triestat_seq_stop(struct seq_file *seq, void *v) @@ -2388,22 +2334,22 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) { int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ int i, max, pointers; - struct trie_stat *stat; + struct trie_stat *stat; int avdepth; stat = trie_collect_stats(t); - bytes=0; + bytes = 0; seq_printf(seq, "trie=%p\n", t); if (stat) { if (stat->leaves) - avdepth=stat->totdepth*100 / stat->leaves; + avdepth = stat->totdepth*100 / stat->leaves; else - avdepth=0; - seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); + avdepth = 0; + seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100); seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); - + seq_printf(seq, "Leaves: %d\n", stat->leaves); bytes += sizeof(struct leaf) * stat->leaves; seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); @@ -2455,11 +2401,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) if (trie_main) collect_and_show(trie_main, seq); - } - else { - snprintf(bf, sizeof(bf), - "*\t%08X\t%08X", 200, 400); - + } else { + snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400); + seq_printf(seq, "%-127s\n", bf); } return 0; @@ -2520,22 +2464,27 @@ static struct fib_alias *fib_trie_get_next(struct seq_file *seq) static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) { - void *v = NULL; + if (!ip_fib_main_table) + return NULL; - if (ip_fib_main_table) - v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN; - return v; + if (*pos) + return fib_trie_get_next(seq); + else + return SEQ_START_TOKEN; } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq); + if (v == SEQ_START_TOKEN) + return fib_trie_get_first(seq); + else + return fib_trie_get_next(seq); + } static void fib_trie_seq_stop(struct seq_file *seq, void *v) { - } /* @@ -2555,9 +2504,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (trie_main) trie_dump_seq(seq, trie_main); - } - - else { + } else { snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400); seq_printf(seq, "%-127s\n", bf); -- cgit v0.10.2 From 2f80b3c8262d0d646812f776db024d88d569a0c1 Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Tue, 9 Aug 2005 20:25:06 -0700 Subject: [IPV4]: fib_trie: Use ERR_PTR to handle errno return Signed-off-by: Robert Olsson Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 6f818cc..914a4c2 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -167,8 +167,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); static int tnode_child_length(struct tnode *tn); static struct node *resize(struct trie *t, struct tnode *tn); -static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); -static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); +static struct tnode *inflate(struct trie *t, struct tnode *tn); +static struct tnode *halve(struct trie *t, struct tnode *tn); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); @@ -457,6 +457,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) { int i; int err = 0; + struct tnode *old_tn; if (!tn) return NULL; @@ -559,9 +560,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold * tnode_child_length(tn))) { - tn = inflate(t, tn, &err); - - if (err) { + old_tn = tn; + tn = inflate(t, tn); + if (IS_ERR(tn)) { + tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.resize_node_skipped++; #endif @@ -581,9 +583,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) 100 * (tnode_child_length(tn) - tn->empty_children) < halve_threshold * tnode_child_length(tn)) { - tn = halve(t, tn, &err); - - if (err) { + old_tn = tn; + tn = halve(t, tn); + if (IS_ERR(tn)) { + tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.resize_node_skipped++; #endif @@ -618,7 +621,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) return (struct node *) tn; } -static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) +static struct tnode *inflate(struct trie *t, struct tnode *tn) { struct tnode *inode; struct tnode *oldtnode = tn; @@ -629,10 +632,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); - if (!tn) { - *err = -ENOMEM; - return oldtnode; - } + if (!tn) + return ERR_PTR(-ENOMEM); /* * Preallocate and store tnodes before the actual work so we @@ -653,39 +654,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) left = tnode_new(inode->key&(~m), inode->pos + 1, inode->bits - 1); - - if (!left) { - *err = -ENOMEM; - break; - } + if (!left) + goto nomem; right = tnode_new(inode->key|m, inode->pos + 1, inode->bits - 1); - if (!right) { - *err = -ENOMEM; - break; - } + if (!right) { + tnode_free(left); + goto nomem; + } put_child(t, tn, 2*i, (struct node *) left); put_child(t, tn, 2*i+1, (struct node *) right); } } - if (*err) { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - *err = -ENOMEM; - return oldtnode; - } - for (i = 0; i < olen; i++) { struct node *node = tnode_get_child(oldtnode, i); struct tnode *left, *right; @@ -763,9 +747,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) } tnode_free(oldtnode); return tn; +nomem: + { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if (tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + return ERR_PTR(-ENOMEM); + } } -static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) +static struct tnode *halve(struct trie *t, struct tnode *tn) { struct tnode *oldtnode = tn; struct node *left, *right; @@ -776,10 +773,8 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); - if (!tn) { - *err = -ENOMEM; - return oldtnode; - } + if (!tn) + return ERR_PTR(-ENOMEM); /* * Preallocate and store tnodes before the actual work so we @@ -794,29 +789,16 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) /* Two nonempty children */ if (left && right) { - struct tnode *newBinNode = - tnode_new(left->key, tn->pos + tn->bits, 1); - - if (!newBinNode) { - *err = -ENOMEM; - break; - } - put_child(t, tn, i/2, (struct node *)newBinNode); + struct tnode *newn; + + newn = tnode_new(left->key, tn->pos + tn->bits, 1); + + if (!newn) + goto nomem; + + put_child(t, tn, i/2, (struct node *)newn); } - } - if (*err) { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - *err = -ENOMEM; - return oldtnode; } for (i = 0; i < olen; i += 2) { @@ -850,6 +832,19 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) } tnode_free(oldtnode); return tn; +nomem: + { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if (tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + return ERR_PTR(-ENOMEM); + } } static void trie_init(struct trie *t) -- cgit v0.10.2 From bb435b8d816582064ee0ddb1e2a6fbca67f34108 Mon Sep 17 00:00:00 2001 From: Stephen Hemmigner Date: Tue, 9 Aug 2005 20:25:39 -0700 Subject: [IPV4]: fib_trie: Use const Use const where possible and get rid of EXTRACT() macro that was never used. Signed-off-by: Stephen Hemmigner Signed-off-by: Robert Olsson Signed-off-by: David S. Miller diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 914a4c2..395f64d 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -77,7 +77,6 @@ #undef CONFIG_IP_FIB_TRIE_STATS #define MAX_CHILDS 16384 -#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n))) #define KEYLENGTH (8*sizeof(t_key)) #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) @@ -162,10 +161,8 @@ static int trie_debug = 0; #define DBG(x...) do { if (trie_debug) printk(x); } while (0) -static int tnode_full(struct tnode *tn, struct node *n); static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); -static int tnode_child_length(struct tnode *tn); static struct node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); @@ -188,7 +185,7 @@ static inline struct node *tnode_get_child(struct tnode *tn, int i) return tn->child[i]; } -static inline int tnode_child_length(struct tnode *tn) +static inline int tnode_child_length(const struct tnode *tn) { return 1 << tn->bits; } @@ -400,7 +397,7 @@ static void tnode_free(struct tnode *tn) * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(struct tnode *tn, struct node *n) +static inline int tnode_full(const struct tnode *tn, const struct node *n) { if (n == NULL || IS_LEAF(n)) return 0; -- cgit v0.10.2 From 5a47a470e602eecb168ddd3b78841b84ceddd319 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:26:03 -0700 Subject: [DCCP]: make include-able from userspace The protocol header files in are usually structured in a way to be included by userspace code. The top section consists of general protocol structure definitions, typedefs, enums - followed by an #ifdef __KERNEL__ section. Currently doesn't follow that convention and can therefore not be used from userspace. However, for example iptables' libipt_dccp.c actually needs various definitions from there. Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/dccp.h b/include/linux/dccp.h index e3b4bf7..add4908b 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -1,16 +1,8 @@ #ifndef _LINUX_DCCP_H #define _LINUX_DCCP_H -#include -#include #include -#include -#include - -#include -#include -#include -#include +#include /* FIXME: this is utterly wrong */ struct sockaddr_dccp { @@ -18,40 +10,6 @@ struct sockaddr_dccp { unsigned int service; }; -enum dccp_state { - DCCP_OPEN = TCP_ESTABLISHED, - DCCP_REQUESTING = TCP_SYN_SENT, - DCCP_PARTOPEN = TCP_FIN_WAIT1, /* FIXME: - This mapping is horrible, but TCP has - no matching state for DCCP_PARTOPEN, - as TCP_SYN_RECV is already used by - DCCP_RESPOND, why don't stop using TCP - mapping of states? OK, now we don't use - sk_stream_sendmsg anymore, so doesn't - seem to exist any reason for us to - do the TCP mapping here */ - DCCP_LISTEN = TCP_LISTEN, - DCCP_RESPOND = TCP_SYN_RECV, - DCCP_CLOSING = TCP_CLOSING, - DCCP_TIME_WAIT = TCP_TIME_WAIT, - DCCP_CLOSED = TCP_CLOSE, - DCCP_MAX_STATES = TCP_MAX_STATES, -}; - -#define DCCP_STATE_MASK 0xf -#define DCCP_ACTION_FIN (1<<7) - -enum { - DCCPF_OPEN = TCPF_ESTABLISHED, - DCCPF_REQUESTING = TCPF_SYN_SENT, - DCCPF_PARTOPEN = TCPF_FIN_WAIT1, - DCCPF_LISTEN = TCPF_LISTEN, - DCCPF_RESPOND = TCPF_SYN_RECV, - DCCPF_CLOSING = TCPF_CLOSING, - DCCPF_TIME_WAIT = TCPF_TIME_WAIT, - DCCPF_CLOSED = TCPF_CLOSE, -}; - /** * struct dccp_hdr - generic part of DCCP packet header * @@ -94,11 +52,6 @@ struct dccp_hdr { #endif }; -static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) -{ - return (struct dccp_hdr *)skb->h.raw; -} - /** * struct dccp_hdr_ext - the low bits of a 48 bit seq packet * @@ -108,34 +61,6 @@ struct dccp_hdr_ext { __u32 dccph_seq_low; }; -static inline struct dccp_hdr_ext *dccp_hdrx(const struct sk_buff *skb) -{ - return (struct dccp_hdr_ext *)(skb->h.raw + sizeof(struct dccp_hdr)); -} - -static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); -} - -static inline __u64 dccp_hdr_seq(const struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 seq_nr = ntohl(dh->dccph_seq << 8); -#elif defined(__BIG_ENDIAN_BITFIELD) - __u64 seq_nr = ntohl(dh->dccph_seq); -#else -#error "Adjust your defines" -#endif - - if (dh->dccph_x != 0) - seq_nr = (seq_nr << 32) + ntohl(dccp_hdrx(skb)->dccph_seq_low); - - return seq_nr; -} - /** * struct dccp_hdr_request - Conection initiation request header * @@ -145,12 +70,6 @@ static inline __u64 dccp_hdr_seq(const struct sk_buff *skb) struct dccp_hdr_request { __u32 dccph_req_service; }; - -static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) -{ - return (struct dccp_hdr_request *)(skb->h.raw + dccp_basic_hdr_len(skb)); -} - /** * struct dccp_hdr_ack_bits - acknowledgment bits common to most packets * @@ -162,24 +81,6 @@ struct dccp_hdr_ack_bits { dccph_ack_nr_high:24; __u32 dccph_ack_nr_low; }; - -static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) -{ - return (struct dccp_hdr_ack_bits *)(skb->h.raw + dccp_basic_hdr_len(skb)); -} - -static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) -{ - const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); -#if defined(__LITTLE_ENDIAN_BITFIELD) - return (((u64)ntohl(dhack->dccph_ack_nr_high << 8)) << 32) + ntohl(dhack->dccph_ack_nr_low); -#elif defined(__BIG_ENDIAN_BITFIELD) - return (((u64)ntohl(dhack->dccph_ack_nr_high)) << 32) + ntohl(dhack->dccph_ack_nr_low); -#else -#error "Adjust your defines" -#endif -} - /** * struct dccp_hdr_response - Conection initiation response header * @@ -193,11 +94,6 @@ struct dccp_hdr_response { __u32 dccph_resp_service; }; -static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) -{ - return (struct dccp_hdr_response *)(skb->h.raw + dccp_basic_hdr_len(skb)); -} - /** * struct dccp_hdr_reset - Unconditionally shut down a connection * @@ -210,11 +106,6 @@ struct dccp_hdr_reset { dccph_reset_data[3]; }; -static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) -{ - return (struct dccp_hdr_reset *)(skb->h.raw + dccp_basic_hdr_len(skb)); -} - enum dccp_pkt_type { DCCP_PKT_REQUEST = 0, DCCP_PKT_RESPONSE, @@ -248,13 +139,6 @@ static inline unsigned int dccp_packet_hdr_len(const __u8 type) return sizeof(struct dccp_hdr_response); return sizeof(struct dccp_hdr_reset); } - -static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) -{ - return dccp_basic_hdr_len(skb) + - dccp_packet_hdr_len(dccp_hdr(skb)->dccph_type); -} - enum dccp_reset_codes { DCCP_RESET_CODE_UNSPECIFIED = 0, DCCP_RESET_CODE_CLOSED, @@ -298,6 +182,124 @@ enum { DCCPF_MAX_CCID_SPECIFIC = 255, }; +#ifdef __KERNEL__ + +#include +#include +#include +#include + +#include +#include +#include +#include + +enum dccp_state { + DCCP_OPEN = TCP_ESTABLISHED, + DCCP_REQUESTING = TCP_SYN_SENT, + DCCP_PARTOPEN = TCP_FIN_WAIT1, /* FIXME: + This mapping is horrible, but TCP has + no matching state for DCCP_PARTOPEN, + as TCP_SYN_RECV is already used by + DCCP_RESPOND, why don't stop using TCP + mapping of states? OK, now we don't use + sk_stream_sendmsg anymore, so doesn't + seem to exist any reason for us to + do the TCP mapping here */ + DCCP_LISTEN = TCP_LISTEN, + DCCP_RESPOND = TCP_SYN_RECV, + DCCP_CLOSING = TCP_CLOSING, + DCCP_TIME_WAIT = TCP_TIME_WAIT, + DCCP_CLOSED = TCP_CLOSE, + DCCP_MAX_STATES = TCP_MAX_STATES, +}; + +#define DCCP_STATE_MASK 0xf +#define DCCP_ACTION_FIN (1<<7) + +enum { + DCCPF_OPEN = TCPF_ESTABLISHED, + DCCPF_REQUESTING = TCPF_SYN_SENT, + DCCPF_PARTOPEN = TCPF_FIN_WAIT1, + DCCPF_LISTEN = TCPF_LISTEN, + DCCPF_RESPOND = TCPF_SYN_RECV, + DCCPF_CLOSING = TCPF_CLOSING, + DCCPF_TIME_WAIT = TCPF_TIME_WAIT, + DCCPF_CLOSED = TCPF_CLOSE, +}; + +static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) +{ + return (struct dccp_hdr *)skb->h.raw; +} + +static inline struct dccp_hdr_ext *dccp_hdrx(const struct sk_buff *skb) +{ + return (struct dccp_hdr_ext *)(skb->h.raw + sizeof(struct dccp_hdr)); +} + +static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); +} + +static inline __u64 dccp_hdr_seq(const struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 seq_nr = ntohl(dh->dccph_seq << 8); +#elif defined(__BIG_ENDIAN_BITFIELD) + __u64 seq_nr = ntohl(dh->dccph_seq); +#else +#error "Adjust your defines" +#endif + + if (dh->dccph_x != 0) + seq_nr = (seq_nr << 32) + ntohl(dccp_hdrx(skb)->dccph_seq_low); + + return seq_nr; +} + +static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) +{ + return (struct dccp_hdr_request *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) +{ + return (struct dccp_hdr_ack_bits *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) +{ + const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); +#if defined(__LITTLE_ENDIAN_BITFIELD) + return (((u64)ntohl(dhack->dccph_ack_nr_high << 8)) << 32) + ntohl(dhack->dccph_ack_nr_low); +#elif defined(__BIG_ENDIAN_BITFIELD) + return (((u64)ntohl(dhack->dccph_ack_nr_high)) << 32) + ntohl(dhack->dccph_ack_nr_low); +#else +#error "Adjust your defines" +#endif +} + +static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) +{ + return (struct dccp_hdr_response *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) +{ + return (struct dccp_hdr_reset *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) +{ + return dccp_basic_hdr_len(skb) + + dccp_packet_hdr_len(dccp_hdr(skb)->dccph_type); +} + + /* initial values for each feature */ #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 /* FIXME: for now we're using CCID 3 (TFRC) */ @@ -429,4 +431,6 @@ static inline const char *dccp_role(const struct sock *sk) return NULL; } +#endif /* __KERNEL__ */ + #endif /* _LINUX_DCCP_H */ -- cgit v0.10.2 From e2e268665f6c01686b477a6b0cc5a70bab689d54 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:26:28 -0700 Subject: [DCCP]: Fix struct sockaddr_dccp definition Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/dccp.h b/include/linux/dccp.h index add4908b..fd1412d 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -4,10 +4,14 @@ #include #include -/* FIXME: this is utterly wrong */ +/* Structure describing an Internet (DCCP) socket address. */ struct sockaddr_dccp { - struct sockaddr_in in; - unsigned int service; + __u16 sdccp_family; /* Address family */ + __u16 sdccp_port; /* Port number */ + __u32 sdccp_addr; /* Internet address */ + __u32 sdccp_service; /* Service */ + /* Pad to size of `struct sockaddr': 16 bytes . */ + __u32 sdccp_pad; }; /** -- cgit v0.10.2 From 1d3de414eb20d937d82c5219fd13ee4cedc499cb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:26:55 -0700 Subject: [NETFILTER]: New iptables DCCP protocol header match Using this new iptables DCCP protocol header match, it is possible to create simplistic stateless packet filtering rules for DCCP. It permits matching of port numbers, packet type and options. Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/dccp.h b/include/linux/dccp.h index fd1412d..431d589 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -242,10 +242,15 @@ static inline struct dccp_hdr_ext *dccp_hdrx(const struct sk_buff *skb) return (struct dccp_hdr_ext *)(skb->h.raw + sizeof(struct dccp_hdr)); } +static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh) +{ + return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); +} + static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); - return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); + return __dccp_basic_hdr_len(dh); } static inline __u64 dccp_hdr_seq(const struct sk_buff *skb) @@ -297,10 +302,15 @@ static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) return (struct dccp_hdr_reset *)(skb->h.raw + dccp_basic_hdr_len(skb)); } +static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh) +{ + return __dccp_basic_hdr_len(dh) + + dccp_packet_hdr_len(dh->dccph_type); +} + static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) { - return dccp_basic_hdr_len(skb) + - dccp_packet_hdr_len(dccp_hdr(skb)->dccph_type); + return __dccp_hdr_len(dccp_hdr(skb)); } diff --git a/include/linux/netfilter_ipv4/ipt_dccp.h b/include/linux/netfilter_ipv4/ipt_dccp.h new file mode 100644 index 0000000..3cb3a52 --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_dccp.h @@ -0,0 +1,23 @@ +#ifndef _IPT_DCCP_H_ +#define _IPT_DCCP_H_ + +#define IPT_DCCP_SRC_PORTS 0x01 +#define IPT_DCCP_DEST_PORTS 0x02 +#define IPT_DCCP_TYPE 0x04 +#define IPT_DCCP_OPTION 0x08 + +#define IPT_DCCP_VALID_FLAGS 0x0f + +struct ipt_dccp_info { + u_int16_t dpts[2]; /* Min, Max */ + u_int16_t spts[2]; /* Min, Max */ + + u_int16_t flags; + u_int16_t invflags; + + u_int16_t typemask; + u_int8_t option; +}; + +#endif /* _IPT_DCCP_H_ */ + diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 2fa26a4..9f5e1d7 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -354,6 +354,17 @@ config IP_NF_MATCH_SCTP If you want to compile it as a module, say M here and read . If unsure, say `N'. +config IP_NF_MATCH_DCCP + tristate 'DCCP protocol match support' + depends on IP_NF_IPTABLES + help + With this option enabled, you will be able to use the iptables + `dccp' match in order to match on DCCP source/destination ports + and DCCP flags. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + config IP_NF_MATCH_COMMENT tristate 'comment match support' depends on IP_NF_IPTABLES diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index c2ae663..58aa7c61 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o +obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c new file mode 100644 index 0000000..ad3278b --- /dev/null +++ b/net/ipv4/netfilter/ipt_dccp.c @@ -0,0 +1,176 @@ +/* + * iptables module for DCCP protocol header matching + * + * (C) 2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static unsigned char *dccp_optbuf; +static DEFINE_SPINLOCK(dccp_buflock); + +static inline int +dccp_find_option(u_int8_t option, + const struct sk_buff *skb, + const struct dccp_hdr *dh, + int *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + unsigned char *op; + unsigned int optoff = __dccp_hdr_len(dh); + unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh); + unsigned int i; + + if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) { + *hotdrop = 1; + return 0; + } + + if (!optlen) + return 0; + + spin_lock_bh(&dccp_buflock); + op = skb_header_pointer(skb, + skb->nh.iph->ihl*4 + optoff, + optlen, dccp_optbuf); + if (op == NULL) { + /* If we don't have the whole header, drop packet. */ + spin_unlock_bh(&dccp_buflock); + *hotdrop = 1; + return 0; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) { + spin_unlock_bh(&dccp_buflock); + return 1; + } + + if (op[i] < 2) + i++; + else + i += op[i+1]?:1; + } + + spin_unlock_bh(&dccp_buflock); + return 0; +} + + +static inline int +match_types(const struct dccp_hdr *dh, u_int16_t typemask) +{ + return (typemask & (1 << dh->dccph_type)); +} + +static inline int +match_option(u_int8_t option, const struct sk_buff *skb, + const struct dccp_hdr *dh, int *hotdrop) +{ + return dccp_find_option(option, skb, dh, hotdrop); +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_dccp_info *info = + (const struct ipt_dccp_info *)matchinfo; + struct dccp_hdr _dh, *dh; + + if (offset) + return 0; + + dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh); + if (dh == NULL) { + *hotdrop = 1; + return 0; + } + + return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) + && (ntohs(dh->dccph_sport) <= info->spts[1])), + IPT_DCCP_SRC_PORTS, info->flags, info->invflags) + && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) + && (ntohs(dh->dccph_dport) <= info->dpts[1])), + IPT_DCCP_DEST_PORTS, info->flags, info->invflags) + && DCCHECK(match_types(dh, info->typemask), + IPT_DCCP_TYPE, info->flags, info->invflags) + && DCCHECK(match_option(info->option, skb, dh, hotdrop), + IPT_DCCP_OPTION, info->flags, info->invflags); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_dccp_info *info; + + info = (const struct ipt_dccp_info *)matchinfo; + + return ip->proto == IPPROTO_DCCP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info)) + && !(info->flags & ~IPT_DCCP_VALID_FLAGS) + && !(info->invflags & ~IPT_DCCP_VALID_FLAGS) + && !(info->invflags & ~info->flags); +} + +static struct ipt_match dccp_match = +{ + .name = "dccp", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int ret; + + /* doff is 8 bits, so the maximum option size is (4*256). Don't put + * this in BSS since DaveM is worried about locked TLB's for kernel + * BSS. */ + dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); + if (!dccp_optbuf) + return -ENOMEM; + ret = ipt_register_match(&dccp_match); + if (ret) + kfree(dccp_optbuf); + + return ret; +} + +static void __exit fini(void) +{ + ipt_unregister_match(&dccp_match); + kfree(dccp_optbuf); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Match for DCCP protocol packets"); + -- cgit v0.10.2 From f6ccf55419c4f0021e7382f000f2fd14a29f3d3c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Aug 2005 20:27:14 -0700 Subject: [DCCP]: Fix u64 printf format warnings. Signed-off-by: David S. Miller diff --git a/net/dccp/input.c b/net/dccp/input.c index 622e976..76c3401 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -274,7 +274,9 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awl, dp->dccps_awh)) { dccp_pr_debug("invalid ackno: S.AWL=%llu, P.ackno=%llu, S.AWH=%llu \n", - dp->dccps_awl, DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awh); + (unsigned long long) dp->dccps_awl, + (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq, + (unsigned long long) dp->dccps_awh); goto out_invalid_packet; } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 7b90606..4fa56db 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1092,14 +1092,16 @@ int dccp_v4_rcv(struct sk_buff *skb) dccp_packet_name(dh->dccph_type), NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport), NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport), - DCCP_SKB_CB(skb)->dccpd_seq); + (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); if (dccp_packet_without_ack(skb)) { DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; dccp_pr_debug_cat("\n"); } else { DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - dccp_pr_debug_cat(", ack=%llu\n", DCCP_SKB_CB(skb)->dccpd_ack_seq); + dccp_pr_debug_cat(", ack=%llu\n", + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq); } /* Step 2: diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 810f0c2..e498e38 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -142,7 +142,10 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, /* Invalid ACK */ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) { dccp_pr_debug("Invalid ACK number: ack_seq=%llu, dreq_iss=%llu\n", - DCCP_SKB_CB(skb)->dccpd_ack_seq, dccp_rsk(req)->dreq_iss); + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq, + (unsigned long long) + dccp_rsk(req)->dreq_iss); goto drop; } diff --git a/net/dccp/options.c b/net/dccp/options.c index e186776..9ca32cb 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -119,7 +119,9 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) opt_recv->dccpor_ack_vector_idx = value - options; dccp_pr_debug("%sACK vector 0, len=%d, ack_ackno=%llu\n", - debug_prefix, len, DCCP_SKB_CB(skb)->dccpd_ack_seq); + debug_prefix, len, + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq); dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); dccp_ackpkts_check_rcv_ackvector(dp->dccps_hc_rx_ackpkts, sk, @@ -137,6 +139,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n", debug_prefix, opt_recv->dccpor_timestamp, + (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); break; case DCCPO_TIMESTAMP_ECHO: @@ -147,7 +150,9 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, diff=%u\n", debug_prefix, opt_recv->dccpor_timestamp_echo, - len + 2, DCCP_SKB_CB(skb)->dccpd_ack_seq, + len + 2, + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq, tcp_time_stamp - opt_recv->dccpor_timestamp_echo); opt_recv->dccpor_elapsed_time = dccp_decode_value_var(value + 4, len - 4); @@ -308,7 +313,8 @@ void dccp_insert_option_elapsed_time(struct sock *sk, dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n", debug_prefix, elapsed_time, - len, DCCP_SKB_CB(skb)->dccpd_seq); + len, + (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); } EXPORT_SYMBOL(dccp_insert_option_elapsed_time); @@ -382,7 +388,8 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, ack_ackno=%llu\n", debug_prefix, ap->dccpap_ack_vector_len, - ap->dccpap_ack_seqno, ap->dccpap_ack_ackno); + (unsigned long long) ap->dccpap_ack_seqno, + (unsigned long long) ap->dccpap_ack_ackno); } static inline void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb) @@ -422,7 +429,8 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk, struct sk_buff *s dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n", debug_prefix, dp->dccps_timestamp_echo, - len, DCCP_SKB_CB(skb)->dccpd_seq); + len, + (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); dp->dccps_timestamp_echo = 0; dp->dccps_timestamp_time = 0; @@ -607,7 +615,8 @@ int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state) */ if (state == DCCP_ACKPKTS_STATE_NOT_RECEIVED && len == 0 && delta == 0) { /* Found our reserved seat! */ - dccp_pr_debug("Found %llu reserved seat!\n", ackno); + dccp_pr_debug("Found %llu reserved seat!\n", + (unsigned long long) ackno); ap->dccpap_buf[index] = state; goto out; } @@ -630,7 +639,8 @@ out: out_duplicate: /* Duplicate packet */ - dccp_pr_debug("Received a dup or already considered lost packet: %llu\n", ackno); + dccp_pr_debug("Received a dup or already considered lost packet: %llu\n", + (unsigned long long) ackno); return -EILSEQ; } @@ -640,7 +650,8 @@ void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len) if (!dccp_debug) return; - printk("ACK vector len=%d, ackno=%llu |", len, ackno); + printk("ACK vector len=%d, ackno=%llu |", len, + (unsigned long long) ackno); while (len--) { const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6; @@ -693,7 +704,8 @@ void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk, #endif dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, ack_ackno=%llu, ACKED!\n", debug_prefix, 1, - ap->dccpap_ack_seqno, ap->dccpap_ack_ackno); + (unsigned long long) ap->dccpap_ack_seqno, + (unsigned long long) ap->dccpap_ack_ackno); dccp_ackpkts_trow_away_ack_record(ap); ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; } @@ -745,7 +757,10 @@ static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, #endif dccp_pr_debug("%sACK vector 0, len=%d, ack_seqno=%llu, ack_ackno=%llu, ACKED!\n", debug_prefix, len, - ap->dccpap_ack_seqno, ap->dccpap_ack_ackno); + (unsigned long long) + ap->dccpap_ack_seqno, + (unsigned long long) + ap->dccpap_ack_ackno); dccp_ackpkts_trow_away_ack_record(ap); } /* -- cgit v0.10.2 From 4aa769b99724953a6f322c648c0cfbe8c6616382 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 9 Aug 2005 20:27:37 -0700 Subject: [Bluetooth]: Update and cleanup of the virtual HCI driver This patch cleans up the virtual HCI driver. It also adds support for the dynamic minor device number allocation. Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index f9b956fb..4aa5dff 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -1,229 +1,220 @@ -/* - BlueZ - Bluetooth protocol stack for Linux - Copyright (C) 2000-2001 Qualcomm Incorporated - - Written 2000,2001 by Maxim Krasnyansky - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License version 2 as - published by the Free Software Foundation; - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. - IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY - CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES - WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, - COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS - SOFTWARE IS DISCLAIMED. -*/ - /* - * Bluetooth HCI virtual device driver. * - * $Id: hci_vhci.c,v 1.3 2002/04/17 17:37:20 maxk Exp $ + * Bluetooth virtual HCI driver + * + * Copyright (C) 2000-2001 Qualcomm Incorporated + * Copyright (C) 2002-2003 Maxim Krasnyansky + * Copyright (C) 2004-2005 Marcel Holtmann + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * */ -#define VERSION "1.1" #include #include -#include #include -#include -#include +#include #include +#include +#include +#include #include -#include -#include -#include #include #include -#include -#include - #include #include -#include "hci_vhci.h" -/* HCI device part */ +#ifndef CONFIG_BT_HCIVHCI_DEBUG +#undef BT_DBG +#define BT_DBG(D...) +#endif + +#define VERSION "1.2" + +static int minor = MISC_DYNAMIC_MINOR; -static int hci_vhci_open(struct hci_dev *hdev) +struct vhci_data { + struct hci_dev *hdev; + + unsigned long flags; + + wait_queue_head_t read_wait; + struct sk_buff_head readq; + + struct fasync_struct *fasync; +}; + +#define VHCI_FASYNC 0x0010 + +static struct miscdevice vhci_miscdev; + +static int vhci_open_dev(struct hci_dev *hdev) { set_bit(HCI_RUNNING, &hdev->flags); - return 0; -} -static int hci_vhci_flush(struct hci_dev *hdev) -{ - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) hdev->driver_data; - skb_queue_purge(&hci_vhci->readq); return 0; } -static int hci_vhci_close(struct hci_dev *hdev) +static int vhci_close_dev(struct hci_dev *hdev) { + struct vhci_data *vhci = hdev->driver_data; + if (!test_and_clear_bit(HCI_RUNNING, &hdev->flags)) return 0; - hci_vhci_flush(hdev); + skb_queue_purge(&vhci->readq); + return 0; } -static void hci_vhci_destruct(struct hci_dev *hdev) +static int vhci_flush(struct hci_dev *hdev) { - struct hci_vhci_struct *vhci; + struct vhci_data *vhci = hdev->driver_data; - if (!hdev) return; + skb_queue_purge(&vhci->readq); - vhci = (struct hci_vhci_struct *) hdev->driver_data; - kfree(vhci); + return 0; } -static int hci_vhci_send_frame(struct sk_buff *skb) +static int vhci_send_frame(struct sk_buff *skb) { struct hci_dev* hdev = (struct hci_dev *) skb->dev; - struct hci_vhci_struct *hci_vhci; + struct vhci_data *vhci; if (!hdev) { - BT_ERR("Frame for uknown device (hdev=NULL)"); + BT_ERR("Frame for unknown HCI device (hdev=NULL)"); return -ENODEV; } if (!test_bit(HCI_RUNNING, &hdev->flags)) return -EBUSY; - hci_vhci = (struct hci_vhci_struct *) hdev->driver_data; + vhci = hdev->driver_data; memcpy(skb_push(skb, 1), &skb->pkt_type, 1); - skb_queue_tail(&hci_vhci->readq, skb); + skb_queue_tail(&vhci->readq, skb); + + if (vhci->flags & VHCI_FASYNC) + kill_fasync(&vhci->fasync, SIGIO, POLL_IN); - if (hci_vhci->flags & VHCI_FASYNC) - kill_fasync(&hci_vhci->fasync, SIGIO, POLL_IN); - wake_up_interruptible(&hci_vhci->read_wait); + wake_up_interruptible(&vhci->read_wait); return 0; } -/* Character device part */ - -/* Poll */ -static unsigned int hci_vhci_chr_poll(struct file *file, poll_table * wait) -{ - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data; - - poll_wait(file, &hci_vhci->read_wait, wait); - - if (!skb_queue_empty(&hci_vhci->readq)) - return POLLIN | POLLRDNORM; - - return POLLOUT | POLLWRNORM; +static void vhci_destruct(struct hci_dev *hdev) +{ + kfree(hdev->driver_data); } -/* Get packet from user space buffer(already verified) */ -static inline ssize_t hci_vhci_get_user(struct hci_vhci_struct *hci_vhci, const char __user *buf, size_t count) +static inline ssize_t vhci_get_user(struct vhci_data *vhci, + const char __user *buf, size_t count) { struct sk_buff *skb; if (count > HCI_MAX_FRAME_SIZE) return -EINVAL; - if (!(skb = bt_skb_alloc(count, GFP_KERNEL))) + skb = bt_skb_alloc(count, GFP_KERNEL); + if (!skb) return -ENOMEM; - + if (copy_from_user(skb_put(skb, count), buf, count)) { kfree_skb(skb); return -EFAULT; } - skb->dev = (void *) hci_vhci->hdev; + skb->dev = (void *) vhci->hdev; skb->pkt_type = *((__u8 *) skb->data); skb_pull(skb, 1); hci_recv_frame(skb); return count; -} - -/* Write */ -static ssize_t hci_vhci_chr_write(struct file * file, const char __user * buf, - size_t count, loff_t *pos) -{ - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data; - - if (!access_ok(VERIFY_READ, buf, count)) - return -EFAULT; - - return hci_vhci_get_user(hci_vhci, buf, count); } -/* Put packet to user space buffer(already verified) */ -static inline ssize_t hci_vhci_put_user(struct hci_vhci_struct *hci_vhci, - struct sk_buff *skb, char __user *buf, - int count) +static inline ssize_t vhci_put_user(struct vhci_data *vhci, + struct sk_buff *skb, char __user *buf, int count) { - int len = count, total = 0; char __user *ptr = buf; + int len, total = 0; + + len = min_t(unsigned int, skb->len, count); - len = min_t(unsigned int, skb->len, len); if (copy_to_user(ptr, skb->data, len)) return -EFAULT; + total += len; - hci_vhci->hdev->stat.byte_tx += len; + vhci->hdev->stat.byte_tx += len; + switch (skb->pkt_type) { case HCI_COMMAND_PKT: - hci_vhci->hdev->stat.cmd_tx++; + vhci->hdev->stat.cmd_tx++; break; case HCI_ACLDATA_PKT: - hci_vhci->hdev->stat.acl_tx++; + vhci->hdev->stat.acl_tx++; break; case HCI_SCODATA_PKT: - hci_vhci->hdev->stat.cmd_tx++; + vhci->hdev->stat.cmd_tx++; break; }; return total; } -/* Read */ -static ssize_t hci_vhci_chr_read(struct file * file, char __user * buf, size_t count, loff_t *pos) +static loff_t vhci_llseek(struct file * file, loff_t offset, int origin) +{ + return -ESPIPE; +} + +static ssize_t vhci_read(struct file * file, char __user * buf, size_t count, loff_t *pos) { - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data; DECLARE_WAITQUEUE(wait, current); + struct vhci_data *vhci = file->private_data; struct sk_buff *skb; ssize_t ret = 0; - add_wait_queue(&hci_vhci->read_wait, &wait); + add_wait_queue(&vhci->read_wait, &wait); while (count) { set_current_state(TASK_INTERRUPTIBLE); - /* Read frames from device queue */ - if (!(skb = skb_dequeue(&hci_vhci->readq))) { + skb = skb_dequeue(&vhci->readq); + if (!skb) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } + if (signal_pending(current)) { ret = -ERESTARTSYS; break; } - /* Nothing to read, let's sleep */ schedule(); continue; } if (access_ok(VERIFY_WRITE, buf, count)) - ret = hci_vhci_put_user(hci_vhci, skb, buf, count); + ret = vhci_put_user(vhci, skb, buf, count); else ret = -EFAULT; @@ -231,84 +222,90 @@ static ssize_t hci_vhci_chr_read(struct file * file, char __user * buf, size_t c break; } set_current_state(TASK_RUNNING); - remove_wait_queue(&hci_vhci->read_wait, &wait); + remove_wait_queue(&vhci->read_wait, &wait); return ret; } -static loff_t hci_vhci_chr_lseek(struct file * file, loff_t offset, int origin) +static ssize_t vhci_write(struct file *file, + const char __user *buf, size_t count, loff_t *pos) { - return -ESPIPE; -} + struct vhci_data *vhci = file->private_data; -static int hci_vhci_chr_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) -{ - return -EINVAL; + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + return vhci_get_user(vhci, buf, count); } -static int hci_vhci_chr_fasync(int fd, struct file *file, int on) +static unsigned int vhci_poll(struct file *file, poll_table *wait) { - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data; - int ret; + struct vhci_data *vhci = file->private_data; - if ((ret = fasync_helper(fd, file, on, &hci_vhci->fasync)) < 0) - return ret; - - if (on) - hci_vhci->flags |= VHCI_FASYNC; - else - hci_vhci->flags &= ~VHCI_FASYNC; + poll_wait(file, &vhci->read_wait, wait); - return 0; + if (!skb_queue_empty(&vhci->readq)) + return POLLIN | POLLRDNORM; + + return POLLOUT | POLLWRNORM; +} + +static int vhci_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + return -EINVAL; } -static int hci_vhci_chr_open(struct inode *inode, struct file * file) +static int vhci_open(struct inode *inode, struct file *file) { - struct hci_vhci_struct *hci_vhci = NULL; + struct vhci_data *vhci; struct hci_dev *hdev; - if (!(hci_vhci = kmalloc(sizeof(struct hci_vhci_struct), GFP_KERNEL))) + vhci = kmalloc(sizeof(struct vhci_data), GFP_KERNEL); + if (!vhci) return -ENOMEM; - memset(hci_vhci, 0, sizeof(struct hci_vhci_struct)); + memset(vhci, 0, sizeof(struct vhci_data)); - skb_queue_head_init(&hci_vhci->readq); - init_waitqueue_head(&hci_vhci->read_wait); + skb_queue_head_init(&vhci->readq); + init_waitqueue_head(&vhci->read_wait); - /* Initialize and register HCI device */ hdev = hci_alloc_dev(); if (!hdev) { - kfree(hci_vhci); + kfree(vhci); return -ENOMEM; } - hci_vhci->hdev = hdev; + vhci->hdev = hdev; hdev->type = HCI_VHCI; - hdev->driver_data = hci_vhci; + hdev->driver_data = vhci; + SET_HCIDEV_DEV(hdev, vhci_miscdev.dev); - hdev->open = hci_vhci_open; - hdev->close = hci_vhci_close; - hdev->flush = hci_vhci_flush; - hdev->send = hci_vhci_send_frame; - hdev->destruct = hci_vhci_destruct; + hdev->open = vhci_open_dev; + hdev->close = vhci_close_dev; + hdev->flush = vhci_flush; + hdev->send = vhci_send_frame; + hdev->destruct = vhci_destruct; hdev->owner = THIS_MODULE; - + if (hci_register_dev(hdev) < 0) { - kfree(hci_vhci); + BT_ERR("Can't register HCI device"); + kfree(vhci); hci_free_dev(hdev); return -EBUSY; } - file->private_data = hci_vhci; - return nonseekable_open(inode, file); + file->private_data = vhci; + + return nonseekable_open(inode, file); } -static int hci_vhci_chr_close(struct inode *inode, struct file *file) +static int vhci_release(struct inode *inode, struct file *file) { - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data; - struct hci_dev *hdev = hci_vhci->hdev; + struct vhci_data *vhci = file->private_data; + struct hci_dev *hdev = vhci->hdev; if (hci_unregister_dev(hdev) < 0) { BT_ERR("Can't unregister HCI device %s", hdev->name); @@ -317,48 +314,71 @@ static int hci_vhci_chr_close(struct inode *inode, struct file *file) hci_free_dev(hdev); file->private_data = NULL; + + return 0; +} + +static int vhci_fasync(int fd, struct file *file, int on) +{ + struct vhci_data *vhci = file->private_data; + int err; + + err = fasync_helper(fd, file, on, &vhci->fasync); + if (err < 0) + return err; + + if (on) + vhci->flags |= VHCI_FASYNC; + else + vhci->flags &= ~VHCI_FASYNC; + return 0; } -static struct file_operations hci_vhci_fops = { - .owner = THIS_MODULE, - .llseek = hci_vhci_chr_lseek, - .read = hci_vhci_chr_read, - .write = hci_vhci_chr_write, - .poll = hci_vhci_chr_poll, - .ioctl = hci_vhci_chr_ioctl, - .open = hci_vhci_chr_open, - .release = hci_vhci_chr_close, - .fasync = hci_vhci_chr_fasync +static struct file_operations vhci_fops = { + .owner = THIS_MODULE, + .llseek = vhci_llseek, + .read = vhci_read, + .write = vhci_write, + .poll = vhci_poll, + .ioctl = vhci_ioctl, + .open = vhci_open, + .release = vhci_release, + .fasync = vhci_fasync, }; -static struct miscdevice hci_vhci_miscdev= -{ - VHCI_MINOR, - "hci_vhci", - &hci_vhci_fops +static struct miscdevice vhci_miscdev= { + .name = "vhci", + .fops = &vhci_fops, }; -static int __init hci_vhci_init(void) +static int __init vhci_init(void) { - BT_INFO("VHCI driver ver %s", VERSION); + BT_INFO("Virtual HCI driver ver %s", VERSION); - if (misc_register(&hci_vhci_miscdev)) { - BT_ERR("Can't register misc device %d\n", VHCI_MINOR); + vhci_miscdev.minor = minor; + + if (misc_register(&vhci_miscdev) < 0) { + BT_ERR("Can't register misc device with minor %d", minor); return -EIO; } return 0; } -static void hci_vhci_cleanup(void) +static void __exit vhci_exit(void) { - misc_deregister(&hci_vhci_miscdev); + if (misc_deregister(&vhci_miscdev) < 0) + BT_ERR("Can't unregister misc device with minor %d", minor); } -module_init(hci_vhci_init); -module_exit(hci_vhci_cleanup); +module_init(vhci_init); +module_exit(vhci_exit); + +module_param(minor, int, 0444); +MODULE_PARM_DESC(minor, "Miscellaneous minor device number"); -MODULE_AUTHOR("Maxim Krasnyansky "); -MODULE_DESCRIPTION("Bluetooth VHCI driver ver " VERSION); -MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Maxim Krasnyansky , Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth virtual HCI driver ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); diff --git a/drivers/bluetooth/hci_vhci.h b/drivers/bluetooth/hci_vhci.h deleted file mode 100644 index 53b11f9..0000000 --- a/drivers/bluetooth/hci_vhci.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - BlueZ - Bluetooth protocol stack for Linux - Copyright (C) 2000-2001 Qualcomm Incorporated - - Written 2000,2001 by Maxim Krasnyansky - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License version 2 as - published by the Free Software Foundation; - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. - IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY - CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES - WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, - COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS - SOFTWARE IS DISCLAIMED. -*/ - -/* - * $Id: hci_vhci.h,v 1.1.1.1 2002/03/08 21:03:15 maxk Exp $ - */ - -#ifndef __HCI_VHCI_H -#define __HCI_VHCI_H - -#ifdef __KERNEL__ - -struct hci_vhci_struct { - struct hci_dev *hdev; - __u32 flags; - wait_queue_head_t read_wait; - struct sk_buff_head readq; - struct fasync_struct *fasync; -}; - -/* VHCI device flags */ -#define VHCI_FASYNC 0x0010 - -#endif /* __KERNEL__ */ - -#define VHCI_DEV "/dev/vhci" -#define VHCI_MINOR 250 - -#endif /* __HCI_VHCI_H */ -- cgit v0.10.2 From 45bb4bf08b9c16122af84d3f26a018c8022b24e5 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 9 Aug 2005 20:27:49 -0700 Subject: [Bluetooth]: Workaround for inquiry results with RSSI and page scan mode This patch implements a workaround for buggy Bluetooth 1.2 devices from Silicon Wave. Their inquiry results with RSSI contain the page scan mode field. This field was removed in the final Bluetooth 1.2 specification. Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 6f0706f..cd075f1 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -453,6 +453,15 @@ struct inquiry_info_with_rssi { __u16 clock_offset; __s8 rssi; } __attribute__ ((packed)); +struct inquiry_info_with_rssi_and_pscan_mode { + bdaddr_t bdaddr; + __u8 pscan_rep_mode; + __u8 pscan_period_mode; + __u8 pscan_mode; + __u8 dev_class[3]; + __u16 clock_offset; + __s8 rssi; +} __attribute__ ((packed)); #define HCI_EV_CONN_COMPLETE 0x03 struct hci_ev_conn_complete { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 46367bd..632f7a9 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -484,14 +484,18 @@ static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff /* Inquiry Result */ static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) { + struct inquiry_data data; struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1); int num_rsp = *((__u8 *) skb->data); BT_DBG("%s num_rsp %d", hdev->name, num_rsp); + if (!num_rsp) + return; + hci_dev_lock(hdev); + for (; num_rsp; num_rsp--) { - struct inquiry_data data; bacpy(&data.bdaddr, &info->bdaddr); data.pscan_rep_mode = info->pscan_rep_mode; data.pscan_period_mode = info->pscan_period_mode; @@ -502,30 +506,55 @@ static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff * info++; hci_inquiry_cache_update(hdev, &data); } + hci_dev_unlock(hdev); } /* Inquiry Result With RSSI */ static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb) { - struct inquiry_info_with_rssi *info = (struct inquiry_info_with_rssi *) (skb->data + 1); + struct inquiry_data data; int num_rsp = *((__u8 *) skb->data); BT_DBG("%s num_rsp %d", hdev->name, num_rsp); + if (!num_rsp) + return; + hci_dev_lock(hdev); - for (; num_rsp; num_rsp--) { - struct inquiry_data data; - bacpy(&data.bdaddr, &info->bdaddr); - data.pscan_rep_mode = info->pscan_rep_mode; - data.pscan_period_mode = info->pscan_period_mode; - data.pscan_mode = 0x00; - memcpy(data.dev_class, info->dev_class, 3); - data.clock_offset = info->clock_offset; - data.rssi = info->rssi; - info++; - hci_inquiry_cache_update(hdev, &data); + + if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) { + struct inquiry_info_with_rssi_and_pscan_mode *info = + (struct inquiry_info_with_rssi_and_pscan_mode *) (skb->data + 1); + + for (; num_rsp; num_rsp--) { + bacpy(&data.bdaddr, &info->bdaddr); + data.pscan_rep_mode = info->pscan_rep_mode; + data.pscan_period_mode = info->pscan_period_mode; + data.pscan_mode = info->pscan_mode; + memcpy(data.dev_class, info->dev_class, 3); + data.clock_offset = info->clock_offset; + data.rssi = info->rssi; + info++; + hci_inquiry_cache_update(hdev, &data); + } + } else { + struct inquiry_info_with_rssi *info = + (struct inquiry_info_with_rssi *) (skb->data + 1); + + for (; num_rsp; num_rsp--) { + bacpy(&data.bdaddr, &info->bdaddr); + data.pscan_rep_mode = info->pscan_rep_mode; + data.pscan_period_mode = info->pscan_period_mode; + data.pscan_mode = 0x00; + memcpy(data.dev_class, info->dev_class, 3); + data.clock_offset = info->clock_offset; + data.rssi = info->rssi; + info++; + hci_inquiry_cache_update(hdev, &data); + } } + hci_dev_unlock(hdev); } -- cgit v0.10.2 From 85a1e930bf628700e8e9c166b1f5c1c26d3651cc Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 9 Aug 2005 20:28:02 -0700 Subject: [Bluetooth]: Track page scan repetition mode changes The HCI page scan repetition mode change event contains the actual page scan repetition mode for the remote device. It is the same value that is received from an inquiry response and it can be used to make further reconnections faster. Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index cd075f1..371e7d3 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -593,6 +593,12 @@ struct hci_ev_clock_offset { __u16 clock_offset; } __attribute__ ((packed)); +#define HCI_EV_PSCAN_REP_MODE 0x20 +struct hci_ev_pscan_rep_mode { + bdaddr_t bdaddr; + __u8 pscan_rep_mode; +} __attribute__ ((packed)); + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xFD struct hci_ev_stack_internal { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 632f7a9..a004284 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -894,6 +894,24 @@ static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *sk hci_dev_unlock(hdev); } +/* Page Scan Repetition Mode */ +static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_pscan_rep_mode *ev = (struct hci_ev_pscan_rep_mode *) skb->data; + struct inquiry_entry *ie; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + if ((ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr))) { + ie->data.pscan_rep_mode = ev->pscan_rep_mode; + ie->timestamp = jiffies; + } + + hci_dev_unlock(hdev); +} + void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data; @@ -966,6 +984,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_clock_offset_evt(hdev, skb); break; + case HCI_EV_PSCAN_REP_MODE: + hci_pscan_rep_mode_evt(hdev, skb); + break; + case HCI_EV_CMD_STATUS: cs = (struct hci_ev_cmd_status *) skb->data; skb_pull(skb, sizeof(cs)); -- cgit v0.10.2 From 7b9eb9e2099f6f4acd6a36bcd7820d27c3cf5ee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=E4s?= Date: Tue, 9 Aug 2005 20:28:21 -0700 Subject: [Bluetooth]: Call tty_hangup() when DCD is de-asserted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RFCOMM layer does not handle properly the de-assertation of CD signal. It should call tty_hangup() to work properly. Signed-off-by: Timo Teräs Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 6304590..67d9dd6 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -528,9 +528,14 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig) struct rfcomm_dev *dev = dlc->owner; if (!dev) return; - + BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig); + if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) { + if (dev->tty && !C_CLOCAL(dev->tty)) + tty_hangup(dev->tty); + } + dev->modem_status = ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | -- cgit v0.10.2 From 3a5e903c09aed19ca4a1bb26d87b8d6461a93818 Mon Sep 17 00:00:00 2001 From: "J. Suter" Date: Tue, 9 Aug 2005 20:28:46 -0700 Subject: [Bluetooth]: Implement RFCOMM remote port negotiation This patch implements the remote port negotiation (RPN) of the RFCOMM protocol for Bluetooth. Signed-off-by: J. Suter Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller diff --git a/include/net/bluetooth/rfcomm.h b/include/net/bluetooth/rfcomm.h index 13669ba..3768823 100644 --- a/include/net/bluetooth/rfcomm.h +++ b/include/net/bluetooth/rfcomm.h @@ -80,9 +80,9 @@ #define RFCOMM_RPN_STOP_15 1 #define RFCOMM_RPN_PARITY_NONE 0x0 -#define RFCOMM_RPN_PARITY_ODD 0x4 -#define RFCOMM_RPN_PARITY_EVEN 0x5 -#define RFCOMM_RPN_PARITY_MARK 0x6 +#define RFCOMM_RPN_PARITY_ODD 0x1 +#define RFCOMM_RPN_PARITY_EVEN 0x3 +#define RFCOMM_RPN_PARITY_MARK 0x5 #define RFCOMM_RPN_PARITY_SPACE 0x7 #define RFCOMM_RPN_FLOW_NONE 0x00 @@ -223,6 +223,12 @@ struct rfcomm_dlc { #define RFCOMM_CFC_DISABLED 0 #define RFCOMM_CFC_ENABLED RFCOMM_MAX_CREDITS +/* ---- RFCOMM SEND RPN ---- */ +int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, + u8 bit_rate, u8 data_bits, u8 stop_bits, + u8 parity, u8 flow_ctrl_settings, + u8 xon_char, u8 xoff_char, u16 param_mask); + /* ---- RFCOMM DLCs (channels) ---- */ struct rfcomm_dlc *rfcomm_dlc_alloc(int prio); void rfcomm_dlc_free(struct rfcomm_dlc *d); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 27bf504..52022cc 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -21,10 +21,6 @@ SOFTWARE IS DISCLAIMED. */ -/* - RPN support - Dirk Husemann -*/ - /* * Bluetooth RFCOMM core. * @@ -115,10 +111,10 @@ static void rfcomm_session_del(struct rfcomm_session *s); #define __get_mcc_len(b) ((b & 0xfe) >> 1) /* RPN macros */ -#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x3) << 3)) +#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x7) << 3)) #define __get_rpn_data_bits(line) ((line) & 0x3) #define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) -#define __get_rpn_parity(line) (((line) >> 3) & 0x3) +#define __get_rpn_parity(line) (((line) >> 3) & 0x7) static inline void rfcomm_schedule(uint event) { @@ -780,10 +776,10 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d return rfcomm_send_frame(s, buf, ptr - buf); } -static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, - u8 bit_rate, u8 data_bits, u8 stop_bits, - u8 parity, u8 flow_ctrl_settings, - u8 xon_char, u8 xoff_char, u16 param_mask) +int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, + u8 bit_rate, u8 data_bits, u8 stop_bits, + u8 parity, u8 flow_ctrl_settings, + u8 xon_char, u8 xoff_char, u16 param_mask) { struct rfcomm_hdr *hdr; struct rfcomm_mcc *mcc; @@ -791,9 +787,9 @@ static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, u8 buf[16], *ptr = buf; BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x" - "flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", - s, cr, dlci, bit_rate, data_bits, stop_bits, parity, - flow_ctrl_settings, xon_char, xoff_char, param_mask); + " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", + s, cr, dlci, bit_rate, data_bits, stop_bits, parity, + flow_ctrl_settings, xon_char, xoff_char, param_mask); hdr = (void *) ptr; ptr += sizeof(*hdr); hdr->addr = __addr(s->initiator, 0); @@ -1265,16 +1261,16 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ u8 xon_char = 0; u8 xoff_char = 0; u16 rpn_mask = RFCOMM_RPN_PM_ALL; - - BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", - dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, - rpn->xon_char, rpn->xoff_char, rpn->param_mask); - - if (!cr) + + BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", + dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, + rpn->xon_char, rpn->xoff_char, rpn->param_mask); + + if (!cr) return 0; - + if (len == 1) { - /* request: return default setting */ + /* This is a request, return default settings */ bit_rate = RFCOMM_RPN_BR_115200; data_bits = RFCOMM_RPN_DATA_8; stop_bits = RFCOMM_RPN_STOP_1; @@ -1282,11 +1278,12 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ flow_ctrl = RFCOMM_RPN_FLOW_NONE; xon_char = RFCOMM_RPN_XON_CHAR; xoff_char = RFCOMM_RPN_XOFF_CHAR; - goto rpn_out; } - /* check for sane values: ignore/accept bit_rate, 8 bits, 1 stop bit, no parity, - no flow control lines, normal XON/XOFF chars */ + + /* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit, + * no parity, no flow control lines, normal XON/XOFF chars */ + if (rpn->param_mask & RFCOMM_RPN_PM_BITRATE) { bit_rate = rpn->bit_rate; if (bit_rate != RFCOMM_RPN_BR_115200) { @@ -1295,6 +1292,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_BITRATE; } } + if (rpn->param_mask & RFCOMM_RPN_PM_DATA) { data_bits = __get_rpn_data_bits(rpn->line_settings); if (data_bits != RFCOMM_RPN_DATA_8) { @@ -1303,6 +1301,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_DATA; } } + if (rpn->param_mask & RFCOMM_RPN_PM_STOP) { stop_bits = __get_rpn_stop_bits(rpn->line_settings); if (stop_bits != RFCOMM_RPN_STOP_1) { @@ -1311,6 +1310,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_STOP; } } + if (rpn->param_mask & RFCOMM_RPN_PM_PARITY) { parity = __get_rpn_parity(rpn->line_settings); if (parity != RFCOMM_RPN_PARITY_NONE) { @@ -1319,6 +1319,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_PARITY; } } + if (rpn->param_mask & RFCOMM_RPN_PM_FLOW) { flow_ctrl = rpn->flow_ctrl; if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) { @@ -1327,6 +1328,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_FLOW; } } + if (rpn->param_mask & RFCOMM_RPN_PM_XON) { xon_char = rpn->xon_char; if (xon_char != RFCOMM_RPN_XON_CHAR) { @@ -1335,6 +1337,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_XON; } } + if (rpn->param_mask & RFCOMM_RPN_PM_XOFF) { xoff_char = rpn->xoff_char; if (xoff_char != RFCOMM_RPN_XOFF_CHAR) { @@ -1345,9 +1348,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ } rpn_out: - rfcomm_send_rpn(s, 0, dlci, - bit_rate, data_bits, stop_bits, parity, flow_ctrl, - xon_char, xoff_char, rpn_mask); + rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits, + parity, flow_ctrl, xon_char, xoff_char, rpn_mask); return 0; } @@ -1358,14 +1360,13 @@ static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb u8 dlci = __get_dlci(rls->dlci); BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status); - + if (!cr) return 0; - /* FIXME: We should probably do something with this - information here. But for now it's sufficient just - to reply -- Bluetooth 1.1 says it's mandatory to - recognise and respond to RLS */ + /* We should probably do something with this information here. But + * for now it's sufficient just to reply -- Bluetooth 1.1 says it's + * mandatory to recognise and respond to RLS */ rfcomm_send_rls(s, 0, dlci, rls->status); @@ -1381,7 +1382,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig); d = rfcomm_dlc_get(s, dlci); - if (!d) + if (!d) return 0; if (cr) { @@ -1389,7 +1390,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb set_bit(RFCOMM_TX_THROTTLED, &d->flags); else clear_bit(RFCOMM_TX_THROTTLED, &d->flags); - + rfcomm_dlc_lock(d); if (d->modem_status) d->modem_status(d, msc->v24_sig); @@ -1398,7 +1399,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb rfcomm_send_msc(s, 0, dlci, msc->v24_sig); d->mscex |= RFCOMM_MSCEX_RX; - } else + } else d->mscex |= RFCOMM_MSCEX_TX; return 0; diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 67d9dd6..bbc3a44 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -745,20 +745,143 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned return -ENOIOCTLCMD; } -#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) - static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old) { - BT_DBG("tty %p", tty); + struct termios *new = (struct termios *) tty->termios; + int old_baud_rate = tty_termios_baud_rate(old); + int new_baud_rate = tty_termios_baud_rate(new); - if ((tty->termios->c_cflag == old->c_cflag) && - (RELEVANT_IFLAG(tty->termios->c_iflag) == RELEVANT_IFLAG(old->c_iflag))) - return; + u8 baud, data_bits, stop_bits, parity, x_on, x_off; + u16 changes = 0; + + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p termios %p", tty, old); + + /* Handle turning off CRTSCTS */ + if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS)) + BT_DBG("Turning off CRTSCTS unsupported"); + + /* Parity on/off and when on, odd/even */ + if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) || + ((old->c_cflag & PARODD) != (new->c_cflag & PARODD)) ) { + changes |= RFCOMM_RPN_PM_PARITY; + BT_DBG("Parity change detected."); + } + + /* Mark and space parity are not supported! */ + if (new->c_cflag & PARENB) { + if (new->c_cflag & PARODD) { + BT_DBG("Parity is ODD"); + parity = RFCOMM_RPN_PARITY_ODD; + } else { + BT_DBG("Parity is EVEN"); + parity = RFCOMM_RPN_PARITY_EVEN; + } + } else { + BT_DBG("Parity is OFF"); + parity = RFCOMM_RPN_PARITY_NONE; + } + + /* Setting the x_on / x_off characters */ + if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) { + BT_DBG("XOFF custom"); + x_on = new->c_cc[VSTOP]; + changes |= RFCOMM_RPN_PM_XON; + } else { + BT_DBG("XOFF default"); + x_on = RFCOMM_RPN_XON_CHAR; + } + + if (old->c_cc[VSTART] != new->c_cc[VSTART]) { + BT_DBG("XON custom"); + x_off = new->c_cc[VSTART]; + changes |= RFCOMM_RPN_PM_XOFF; + } else { + BT_DBG("XON default"); + x_off = RFCOMM_RPN_XOFF_CHAR; + } + + /* Handle setting of stop bits */ + if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB)) + changes |= RFCOMM_RPN_PM_STOP; + + /* POSIX does not support 1.5 stop bits and RFCOMM does not + * support 2 stop bits. So a request for 2 stop bits gets + * translated to 1.5 stop bits */ + if (new->c_cflag & CSTOPB) { + stop_bits = RFCOMM_RPN_STOP_15; + } else { + stop_bits = RFCOMM_RPN_STOP_1; + } - /* handle turning off CRTSCTS */ - if ((old->c_cflag & CRTSCTS) && !(tty->termios->c_cflag & CRTSCTS)) { - BT_DBG("turning off CRTSCTS"); + /* Handle number of data bits [5-8] */ + if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE)) + changes |= RFCOMM_RPN_PM_DATA; + + switch (new->c_cflag & CSIZE) { + case CS5: + data_bits = RFCOMM_RPN_DATA_5; + break; + case CS6: + data_bits = RFCOMM_RPN_DATA_6; + break; + case CS7: + data_bits = RFCOMM_RPN_DATA_7; + break; + case CS8: + data_bits = RFCOMM_RPN_DATA_8; + break; + default: + data_bits = RFCOMM_RPN_DATA_8; + break; } + + /* Handle baudrate settings */ + if (old_baud_rate != new_baud_rate) + changes |= RFCOMM_RPN_PM_BITRATE; + + switch (new_baud_rate) { + case 2400: + baud = RFCOMM_RPN_BR_2400; + break; + case 4800: + baud = RFCOMM_RPN_BR_4800; + break; + case 7200: + baud = RFCOMM_RPN_BR_7200; + break; + case 9600: + baud = RFCOMM_RPN_BR_9600; + break; + case 19200: + baud = RFCOMM_RPN_BR_19200; + break; + case 38400: + baud = RFCOMM_RPN_BR_38400; + break; + case 57600: + baud = RFCOMM_RPN_BR_57600; + break; + case 115200: + baud = RFCOMM_RPN_BR_115200; + break; + case 230400: + baud = RFCOMM_RPN_BR_230400; + break; + default: + /* 9600 is standard accordinag to the RFCOMM specification */ + baud = RFCOMM_RPN_BR_9600; + break; + + } + + if (changes) + rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud, + data_bits, stop_bits, parity, + RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes); + + return; } static void rfcomm_tty_throttle(struct tty_struct *tty) @@ -766,7 +889,7 @@ static void rfcomm_tty_throttle(struct tty_struct *tty) struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); - + rfcomm_dlc_throttle(dev->dlc); } @@ -775,7 +898,7 @@ static void rfcomm_tty_unthrottle(struct tty_struct *tty) struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); - + rfcomm_dlc_unthrottle(dev->dlc); } @@ -846,35 +969,35 @@ static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp) static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; - struct rfcomm_dlc *dlc = dev->dlc; - u8 v24_sig; + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dlc *dlc = dev->dlc; + u8 v24_sig; BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear); - rfcomm_dlc_get_modem_status(dlc, &v24_sig); - - if (set & TIOCM_DSR || set & TIOCM_DTR) - v24_sig |= RFCOMM_V24_RTC; - if (set & TIOCM_RTS || set & TIOCM_CTS) - v24_sig |= RFCOMM_V24_RTR; - if (set & TIOCM_RI) - v24_sig |= RFCOMM_V24_IC; - if (set & TIOCM_CD) - v24_sig |= RFCOMM_V24_DV; - - if (clear & TIOCM_DSR || clear & TIOCM_DTR) - v24_sig &= ~RFCOMM_V24_RTC; - if (clear & TIOCM_RTS || clear & TIOCM_CTS) - v24_sig &= ~RFCOMM_V24_RTR; - if (clear & TIOCM_RI) - v24_sig &= ~RFCOMM_V24_IC; - if (clear & TIOCM_CD) - v24_sig &= ~RFCOMM_V24_DV; - - rfcomm_dlc_set_modem_status(dlc, v24_sig); - - return 0; + rfcomm_dlc_get_modem_status(dlc, &v24_sig); + + if (set & TIOCM_DSR || set & TIOCM_DTR) + v24_sig |= RFCOMM_V24_RTC; + if (set & TIOCM_RTS || set & TIOCM_CTS) + v24_sig |= RFCOMM_V24_RTR; + if (set & TIOCM_RI) + v24_sig |= RFCOMM_V24_IC; + if (set & TIOCM_CD) + v24_sig |= RFCOMM_V24_DV; + + if (clear & TIOCM_DSR || clear & TIOCM_DTR) + v24_sig &= ~RFCOMM_V24_RTC; + if (clear & TIOCM_RTS || clear & TIOCM_CTS) + v24_sig &= ~RFCOMM_V24_RTR; + if (clear & TIOCM_RI) + v24_sig &= ~RFCOMM_V24_IC; + if (clear & TIOCM_CD) + v24_sig &= ~RFCOMM_V24_DV; + + rfcomm_dlc_set_modem_status(dlc, v24_sig); + + return 0; } /* ---- TTY structure ---- */ -- cgit v0.10.2 From 2eb25a6c34504254760e67172f7518d6bfdd7676 Mon Sep 17 00:00:00 2001 From: Victor Fusco Date: Tue, 9 Aug 2005 20:29:11 -0700 Subject: [Bluetooth]: Fix sparse warnings (__nocast type) This patch fixes the sparse warnings "implicit cast to nocast type" for the priority or gfp_mask parameters of the memory allocations. Signed-off-by: Victor Fusco Signed-off-by: Domen Puncer Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c index f696da6..d6b81c1 100644 --- a/drivers/bluetooth/bpa10x.c +++ b/drivers/bluetooth/bpa10x.c @@ -307,7 +307,8 @@ unlock: read_unlock(&data->lock); } -static inline struct urb *bpa10x_alloc_urb(struct usb_device *udev, unsigned int pipe, size_t size, int flags, void *data) +static inline struct urb *bpa10x_alloc_urb(struct usb_device *udev, unsigned int pipe, + size_t size, unsigned int __nocast flags, void *data) { struct urb *urb; struct usb_ctrlrequest *cr; diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index 657719b..6574034 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -127,7 +127,7 @@ static struct usb_device_id blacklist_ids[] = { { } /* Terminating entry */ }; -static struct _urb *_urb_alloc(int isoc, int gfp) +static struct _urb *_urb_alloc(int isoc, unsigned int __nocast gfp) { struct _urb *_urb = kmalloc(sizeof(struct _urb) + sizeof(struct usb_iso_packet_descriptor) * isoc, gfp); diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 06b24f6..5309b6f 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -135,7 +135,7 @@ struct bt_skb_cb { }; #define bt_cb(skb) ((struct bt_skb_cb *)(skb->cb)) -static inline struct sk_buff *bt_skb_alloc(unsigned int len, int how) +static inline struct sk_buff *bt_skb_alloc(unsigned int len, unsigned int __nocast how) { struct sk_buff *skb; diff --git a/include/net/bluetooth/rfcomm.h b/include/net/bluetooth/rfcomm.h index 3768823..ffea9d5 100644 --- a/include/net/bluetooth/rfcomm.h +++ b/include/net/bluetooth/rfcomm.h @@ -230,7 +230,7 @@ int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, u8 xon_char, u8 xoff_char, u16 param_mask); /* ---- RFCOMM DLCs (channels) ---- */ -struct rfcomm_dlc *rfcomm_dlc_alloc(int prio); +struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio); void rfcomm_dlc_free(struct rfcomm_dlc *d); int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel); int rfcomm_dlc_close(struct rfcomm_dlc *d, int reason); diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 32fccfb..d3d6bc5 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -372,7 +372,7 @@ static struct proto l2cap_proto = { .obj_size = sizeof(struct l2cap_pinfo) }; -static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, int prio) +static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio) { struct sock *sk; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 52022cc..173f46e 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -229,7 +229,7 @@ static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d) d->rx_credits = RFCOMM_DEFAULT_CREDITS; } -struct rfcomm_dlc *rfcomm_dlc_alloc(int prio) +struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio) { struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio); if (!d) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 63a123c..90e19eb 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -284,7 +284,7 @@ static struct proto rfcomm_proto = { .obj_size = sizeof(struct rfcomm_pinfo) }; -static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, int prio) +static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio) { struct rfcomm_dlc *d; struct sock *sk; diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index bbc3a44..1bca860 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -286,7 +286,7 @@ static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *de skb->destructor = rfcomm_wfree; } -static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, int priority) +static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, unsigned int __nocast priority) { if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) { struct sk_buff *skb = alloc_skb(size, priority); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 746c11f..ce7ab7d 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -418,7 +418,7 @@ static struct proto sco_proto = { .obj_size = sizeof(struct sco_pinfo) }; -static struct sock *sco_sock_alloc(struct socket *sock, int proto, int prio) +static struct sock *sco_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio) { struct sock *sk; -- cgit v0.10.2 From 0d48d93947dd9ea21c5cdc76a8581b06a4a39281 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 9 Aug 2005 20:30:28 -0700 Subject: [Bluetooth]: Move packet type into the SKB control buffer This patch moves the usage of packet type into the SKB control buffer. After this patch it is now possible to shrink the sk_buff structure and redefine its pkt_type. Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller diff --git a/drivers/bluetooth/bfusb.c b/drivers/bluetooth/bfusb.c index e8d2a34..1e9db01 100644 --- a/drivers/bluetooth/bfusb.c +++ b/drivers/bluetooth/bfusb.c @@ -330,7 +330,7 @@ static inline int bfusb_recv_block(struct bfusb *bfusb, int hdr, unsigned char * } skb->dev = (void *) bfusb->hdev; - skb->pkt_type = pkt_type; + bt_cb(skb)->pkt_type = pkt_type; bfusb->reassembly = skb; } else { @@ -485,7 +485,7 @@ static int bfusb_send_frame(struct sk_buff *skb) unsigned char buf[3]; int sent = 0, size, count; - BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, skb->pkt_type, skb->len); + BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, bt_cb(skb)->pkt_type, skb->len); if (!hdev) { BT_ERR("Frame for unknown HCI device (hdev=NULL)"); @@ -497,7 +497,7 @@ static int bfusb_send_frame(struct sk_buff *skb) bfusb = (struct bfusb *) hdev->driver_data; - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; @@ -510,7 +510,7 @@ static int bfusb_send_frame(struct sk_buff *skb) }; /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &(skb->pkt_type), 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); count = skb->len; diff --git a/drivers/bluetooth/bluecard_cs.c b/drivers/bluetooth/bluecard_cs.c index bd2ec7e..26fe9c0 100644 --- a/drivers/bluetooth/bluecard_cs.c +++ b/drivers/bluetooth/bluecard_cs.c @@ -270,7 +270,7 @@ static void bluecard_write_wakeup(bluecard_info_t *info) if (!(skb = skb_dequeue(&(info->txq)))) break; - if (skb->pkt_type & 0x80) { + if (bt_cb(skb)->pkt_type & 0x80) { /* Disable RTS */ info->ctrl_reg |= REG_CONTROL_RTS; outb(info->ctrl_reg, iobase + REG_CONTROL); @@ -288,13 +288,13 @@ static void bluecard_write_wakeup(bluecard_info_t *info) /* Mark the buffer as dirty */ clear_bit(ready_bit, &(info->tx_state)); - if (skb->pkt_type & 0x80) { + if (bt_cb(skb)->pkt_type & 0x80) { DECLARE_WAIT_QUEUE_HEAD(wq); DEFINE_WAIT(wait); unsigned char baud_reg; - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case PKT_BAUD_RATE_460800: baud_reg = REG_CONTROL_BAUD_RATE_460800; break; @@ -410,9 +410,9 @@ static void bluecard_receive(bluecard_info_t *info, unsigned int offset) if (info->rx_state == RECV_WAIT_PACKET_TYPE) { info->rx_skb->dev = (void *) info->hdev; - info->rx_skb->pkt_type = buf[i]; + bt_cb(info->rx_skb)->pkt_type = buf[i]; - switch (info->rx_skb->pkt_type) { + switch (bt_cb(info->rx_skb)->pkt_type) { case 0x00: /* init packet */ @@ -444,7 +444,7 @@ static void bluecard_receive(bluecard_info_t *info, unsigned int offset) default: /* unknown packet */ - BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type); + BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type); info->hdev->stat.err_rx++; kfree_skb(info->rx_skb); @@ -586,21 +586,21 @@ static int bluecard_hci_set_baud_rate(struct hci_dev *hdev, int baud) switch (baud) { case 460800: cmd[4] = 0x00; - skb->pkt_type = PKT_BAUD_RATE_460800; + bt_cb(skb)->pkt_type = PKT_BAUD_RATE_460800; break; case 230400: cmd[4] = 0x01; - skb->pkt_type = PKT_BAUD_RATE_230400; + bt_cb(skb)->pkt_type = PKT_BAUD_RATE_230400; break; case 115200: cmd[4] = 0x02; - skb->pkt_type = PKT_BAUD_RATE_115200; + bt_cb(skb)->pkt_type = PKT_BAUD_RATE_115200; break; case 57600: /* Fall through... */ default: cmd[4] = 0x03; - skb->pkt_type = PKT_BAUD_RATE_57600; + bt_cb(skb)->pkt_type = PKT_BAUD_RATE_57600; break; } @@ -680,7 +680,7 @@ static int bluecard_hci_send_frame(struct sk_buff *skb) info = (bluecard_info_t *)(hdev->driver_data); - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; @@ -693,7 +693,7 @@ static int bluecard_hci_send_frame(struct sk_buff *skb) }; /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &(skb->pkt_type), 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); skb_queue_tail(&(info->txq), skb); bluecard_write_wakeup(info); diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c index d6b81c1..a1bf8f0 100644 --- a/drivers/bluetooth/bpa10x.c +++ b/drivers/bluetooth/bpa10x.c @@ -105,7 +105,7 @@ static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int c if (skb) { memcpy(skb_put(skb, len), buf, len); skb->dev = (void *) data->hdev; - skb->pkt_type = HCI_ACLDATA_PKT; + bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; hci_recv_frame(skb); } break; @@ -117,7 +117,7 @@ static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int c if (skb) { memcpy(skb_put(skb, len), buf, len); skb->dev = (void *) data->hdev; - skb->pkt_type = HCI_SCODATA_PKT; + bt_cb(skb)->pkt_type = HCI_SCODATA_PKT; hci_recv_frame(skb); } break; @@ -129,7 +129,7 @@ static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int c if (skb) { memcpy(skb_put(skb, len), buf, len); skb->dev = (void *) data->hdev; - skb->pkt_type = HCI_VENDOR_PKT; + bt_cb(skb)->pkt_type = HCI_VENDOR_PKT; hci_recv_frame(skb); } break; @@ -190,7 +190,7 @@ static int bpa10x_recv_event(struct bpa10x_data *data, unsigned char *buf, int s } skb->dev = (void *) data->hdev; - skb->pkt_type = pkt_type; + bt_cb(skb)->pkt_type = pkt_type; memcpy(skb_put(skb, size), buf, size); @@ -488,7 +488,7 @@ static int bpa10x_send_frame(struct sk_buff *skb) struct hci_dev *hdev = (struct hci_dev *) skb->dev; struct bpa10x_data *data; - BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, skb->pkt_type, skb->len); + BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, bt_cb(skb)->pkt_type, skb->len); if (!hdev) { BT_ERR("Frame for unknown HCI device"); @@ -501,9 +501,9 @@ static int bpa10x_send_frame(struct sk_buff *skb) data = hdev->driver_data; /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &(skb->pkt_type), 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; skb_queue_tail(&data->cmd_queue, skb); diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c index adf1750..2e0338d 100644 --- a/drivers/bluetooth/bt3c_cs.c +++ b/drivers/bluetooth/bt3c_cs.c @@ -259,11 +259,11 @@ static void bt3c_receive(bt3c_info_t *info) if (info->rx_state == RECV_WAIT_PACKET_TYPE) { info->rx_skb->dev = (void *) info->hdev; - info->rx_skb->pkt_type = inb(iobase + DATA_L); + bt_cb(info->rx_skb)->pkt_type = inb(iobase + DATA_L); inb(iobase + DATA_H); - //printk("bt3c: PACKET_TYPE=%02x\n", info->rx_skb->pkt_type); + //printk("bt3c: PACKET_TYPE=%02x\n", bt_cb(info->rx_skb)->pkt_type); - switch (info->rx_skb->pkt_type) { + switch (bt_cb(info->rx_skb)->pkt_type) { case HCI_EVENT_PKT: info->rx_state = RECV_WAIT_EVENT_HEADER; @@ -282,7 +282,7 @@ static void bt3c_receive(bt3c_info_t *info) default: /* Unknown packet */ - BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type); + BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type); info->hdev->stat.err_rx++; clear_bit(HCI_RUNNING, &(info->hdev->flags)); @@ -439,7 +439,7 @@ static int bt3c_hci_send_frame(struct sk_buff *skb) info = (bt3c_info_t *) (hdev->driver_data); - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; @@ -452,7 +452,7 @@ static int bt3c_hci_send_frame(struct sk_buff *skb) }; /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &(skb->pkt_type), 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); skb_queue_tail(&(info->txq), skb); spin_lock_irqsave(&(info->lock), flags); diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c index e4c59fd..89486ea 100644 --- a/drivers/bluetooth/btuart_cs.c +++ b/drivers/bluetooth/btuart_cs.c @@ -211,9 +211,9 @@ static void btuart_receive(btuart_info_t *info) if (info->rx_state == RECV_WAIT_PACKET_TYPE) { info->rx_skb->dev = (void *) info->hdev; - info->rx_skb->pkt_type = inb(iobase + UART_RX); + bt_cb(info->rx_skb)->pkt_type = inb(iobase + UART_RX); - switch (info->rx_skb->pkt_type) { + switch (bt_cb(info->rx_skb)->pkt_type) { case HCI_EVENT_PKT: info->rx_state = RECV_WAIT_EVENT_HEADER; @@ -232,7 +232,7 @@ static void btuart_receive(btuart_info_t *info) default: /* Unknown packet */ - BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type); + BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type); info->hdev->stat.err_rx++; clear_bit(HCI_RUNNING, &(info->hdev->flags)); @@ -447,7 +447,7 @@ static int btuart_hci_send_frame(struct sk_buff *skb) info = (btuart_info_t *)(hdev->driver_data); - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; @@ -460,7 +460,7 @@ static int btuart_hci_send_frame(struct sk_buff *skb) }; /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &(skb->pkt_type), 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); skb_queue_tail(&(info->txq), skb); btuart_write_wakeup(info); diff --git a/drivers/bluetooth/dtl1_cs.c b/drivers/bluetooth/dtl1_cs.c index e39868c..84c1f88 100644 --- a/drivers/bluetooth/dtl1_cs.c +++ b/drivers/bluetooth/dtl1_cs.c @@ -251,7 +251,7 @@ static void dtl1_receive(dtl1_info_t *info) info->rx_count = nsh->len + (nsh->len & 0x0001); break; case RECV_WAIT_DATA: - info->rx_skb->pkt_type = nsh->type; + bt_cb(info->rx_skb)->pkt_type = nsh->type; /* remove PAD byte if it exists */ if (nsh->len & 0x0001) { @@ -262,7 +262,7 @@ static void dtl1_receive(dtl1_info_t *info) /* remove NSH */ skb_pull(info->rx_skb, NSHL); - switch (info->rx_skb->pkt_type) { + switch (bt_cb(info->rx_skb)->pkt_type) { case 0x80: /* control data for the Nokia Card */ dtl1_control(info, info->rx_skb); @@ -272,12 +272,12 @@ static void dtl1_receive(dtl1_info_t *info) case 0x84: /* send frame to the HCI layer */ info->rx_skb->dev = (void *) info->hdev; - info->rx_skb->pkt_type &= 0x0f; + bt_cb(info->rx_skb)->pkt_type &= 0x0f; hci_recv_frame(info->rx_skb); break; default: /* unknown packet */ - BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type); + BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type); kfree_skb(info->rx_skb); break; } @@ -410,7 +410,7 @@ static int dtl1_hci_send_frame(struct sk_buff *skb) info = (dtl1_info_t *)(hdev->driver_data); - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; nsh.type = 0x81; diff --git a/drivers/bluetooth/hci_bcsp.c b/drivers/bluetooth/hci_bcsp.c index 858fddb..0ee324e 100644 --- a/drivers/bluetooth/hci_bcsp.c +++ b/drivers/bluetooth/hci_bcsp.c @@ -149,7 +149,7 @@ static int bcsp_enqueue(struct hci_uart *hu, struct sk_buff *skb) return 0; } - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_ACLDATA_PKT: case HCI_COMMAND_PKT: skb_queue_tail(&bcsp->rel, skb); @@ -227,7 +227,7 @@ static struct sk_buff *bcsp_prepare_pkt(struct bcsp_struct *bcsp, u8 *data, if (!nskb) return NULL; - nskb->pkt_type = pkt_type; + bt_cb(nskb)->pkt_type = pkt_type; bcsp_slip_msgdelim(nskb); @@ -286,7 +286,7 @@ static struct sk_buff *bcsp_dequeue(struct hci_uart *hu) since they have priority */ if ((skb = skb_dequeue(&bcsp->unrel)) != NULL) { - struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, skb->pkt_type); + struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, bt_cb(skb)->pkt_type); if (nskb) { kfree_skb(skb); return nskb; @@ -303,7 +303,7 @@ static struct sk_buff *bcsp_dequeue(struct hci_uart *hu) spin_lock_irqsave(&bcsp->unack.lock, flags); if (bcsp->unack.qlen < BCSP_TXWINSIZE && (skb = skb_dequeue(&bcsp->rel)) != NULL) { - struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, skb->pkt_type); + struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, bt_cb(skb)->pkt_type); if (nskb) { __skb_queue_tail(&bcsp->unack, skb); mod_timer(&bcsp->tbcsp, jiffies + HZ / 4); @@ -401,7 +401,7 @@ static void bcsp_handle_le_pkt(struct hci_uart *hu) if (!nskb) return; memcpy(skb_put(nskb, 4), conf_rsp_pkt, 4); - nskb->pkt_type = BCSP_LE_PKT; + bt_cb(nskb)->pkt_type = BCSP_LE_PKT; skb_queue_head(&bcsp->unrel, nskb); hci_uart_tx_wakeup(hu); @@ -483,14 +483,14 @@ static inline void bcsp_complete_rx_pkt(struct hci_uart *hu) bcsp_pkt_cull(bcsp); if ((bcsp->rx_skb->data[1] & 0x0f) == 6 && bcsp->rx_skb->data[0] & 0x80) { - bcsp->rx_skb->pkt_type = HCI_ACLDATA_PKT; + bt_cb(bcsp->rx_skb)->pkt_type = HCI_ACLDATA_PKT; pass_up = 1; } else if ((bcsp->rx_skb->data[1] & 0x0f) == 5 && bcsp->rx_skb->data[0] & 0x80) { - bcsp->rx_skb->pkt_type = HCI_EVENT_PKT; + bt_cb(bcsp->rx_skb)->pkt_type = HCI_EVENT_PKT; pass_up = 1; } else if ((bcsp->rx_skb->data[1] & 0x0f) == 7) { - bcsp->rx_skb->pkt_type = HCI_SCODATA_PKT; + bt_cb(bcsp->rx_skb)->pkt_type = HCI_SCODATA_PKT; pass_up = 1; } else if ((bcsp->rx_skb->data[1] & 0x0f) == 1 && !(bcsp->rx_skb->data[0] & 0x80)) { @@ -512,7 +512,7 @@ static inline void bcsp_complete_rx_pkt(struct hci_uart *hu) hdr.evt = 0xff; hdr.plen = bcsp->rx_skb->len; memcpy(skb_push(bcsp->rx_skb, HCI_EVENT_HDR_SIZE), &hdr, HCI_EVENT_HDR_SIZE); - bcsp->rx_skb->pkt_type = HCI_EVENT_PKT; + bt_cb(bcsp->rx_skb)->pkt_type = HCI_EVENT_PKT; hci_recv_frame(bcsp->rx_skb); } else { diff --git a/drivers/bluetooth/hci_h4.c b/drivers/bluetooth/hci_h4.c index 533323b..cf8a22d 100644 --- a/drivers/bluetooth/hci_h4.c +++ b/drivers/bluetooth/hci_h4.c @@ -112,7 +112,7 @@ static int h4_enqueue(struct hci_uart *hu, struct sk_buff *skb) BT_DBG("hu %p skb %p", hu, skb); /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &skb->pkt_type, 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); skb_queue_tail(&h4->txq, skb); return 0; } @@ -239,7 +239,7 @@ static int h4_recv(struct hci_uart *hu, void *data, int count) return 0; } h4->rx_skb->dev = (void *) hu->hdev; - h4->rx_skb->pkt_type = type; + bt_cb(h4->rx_skb)->pkt_type = type; } return count; } diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index 90be2ea..aed80cc 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -153,7 +153,7 @@ restart: break; } - hci_uart_tx_complete(hu, skb->pkt_type); + hci_uart_tx_complete(hu, bt_cb(skb)->pkt_type); kfree_skb(skb); } @@ -229,7 +229,7 @@ static int hci_uart_send_frame(struct sk_buff *skb) hu = (struct hci_uart *) hdev->driver_data; tty = hu->tty; - BT_DBG("%s: type %d len %d", hdev->name, skb->pkt_type, skb->len); + BT_DBG("%s: type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len); hu->proto->enqueue(hu, skb); diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index 6574034..67d96b5 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -443,7 +443,7 @@ static int __tx_submit(struct hci_usb *husb, struct _urb *_urb) static inline int hci_usb_send_ctrl(struct hci_usb *husb, struct sk_buff *skb) { - struct _urb *_urb = __get_completed(husb, skb->pkt_type); + struct _urb *_urb = __get_completed(husb, bt_cb(skb)->pkt_type); struct usb_ctrlrequest *dr; struct urb *urb; @@ -451,7 +451,7 @@ static inline int hci_usb_send_ctrl(struct hci_usb *husb, struct sk_buff *skb) _urb = _urb_alloc(0, GFP_ATOMIC); if (!_urb) return -ENOMEM; - _urb->type = skb->pkt_type; + _urb->type = bt_cb(skb)->pkt_type; dr = kmalloc(sizeof(*dr), GFP_ATOMIC); if (!dr) { @@ -479,7 +479,7 @@ static inline int hci_usb_send_ctrl(struct hci_usb *husb, struct sk_buff *skb) static inline int hci_usb_send_bulk(struct hci_usb *husb, struct sk_buff *skb) { - struct _urb *_urb = __get_completed(husb, skb->pkt_type); + struct _urb *_urb = __get_completed(husb, bt_cb(skb)->pkt_type); struct urb *urb; int pipe; @@ -487,7 +487,7 @@ static inline int hci_usb_send_bulk(struct hci_usb *husb, struct sk_buff *skb) _urb = _urb_alloc(0, GFP_ATOMIC); if (!_urb) return -ENOMEM; - _urb->type = skb->pkt_type; + _urb->type = bt_cb(skb)->pkt_type; } urb = &_urb->urb; @@ -505,14 +505,14 @@ static inline int hci_usb_send_bulk(struct hci_usb *husb, struct sk_buff *skb) #ifdef CONFIG_BT_HCIUSB_SCO static inline int hci_usb_send_isoc(struct hci_usb *husb, struct sk_buff *skb) { - struct _urb *_urb = __get_completed(husb, skb->pkt_type); + struct _urb *_urb = __get_completed(husb, bt_cb(skb)->pkt_type); struct urb *urb; if (!_urb) { _urb = _urb_alloc(HCI_MAX_ISOC_FRAMES, GFP_ATOMIC); if (!_urb) return -ENOMEM; - _urb->type = skb->pkt_type; + _urb->type = bt_cb(skb)->pkt_type; } BT_DBG("%s skb %p len %d", husb->hdev->name, skb, skb->len); @@ -601,11 +601,11 @@ static int hci_usb_send_frame(struct sk_buff *skb) if (!test_bit(HCI_RUNNING, &hdev->flags)) return -EBUSY; - BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len); + BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len); husb = (struct hci_usb *) hdev->driver_data; - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; @@ -627,7 +627,7 @@ static int hci_usb_send_frame(struct sk_buff *skb) read_lock(&husb->completion_lock); - skb_queue_tail(__transmit_q(husb, skb->pkt_type), skb); + skb_queue_tail(__transmit_q(husb, bt_cb(skb)->pkt_type), skb); hci_usb_tx_wakeup(husb); read_unlock(&husb->completion_lock); @@ -682,7 +682,7 @@ static inline int __recv_frame(struct hci_usb *husb, int type, void *data, int c return -ENOMEM; } skb->dev = (void *) husb->hdev; - skb->pkt_type = type; + bt_cb(skb)->pkt_type = type; __reassembly(husb, type) = skb; @@ -702,6 +702,7 @@ static inline int __recv_frame(struct hci_usb *husb, int type, void *data, int c if (!scb->expect) { /* Complete frame */ __reassembly(husb, type) = NULL; + bt_cb(skb)->pkt_type = type; hci_recv_frame(skb); } diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index 4aa5dff..52cbd45 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -107,7 +107,7 @@ static int vhci_send_frame(struct sk_buff *skb) vhci = hdev->driver_data; - memcpy(skb_push(skb, 1), &skb->pkt_type, 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); skb_queue_tail(&vhci->readq, skb); if (vhci->flags & VHCI_FASYNC) @@ -141,7 +141,7 @@ static inline ssize_t vhci_get_user(struct vhci_data *vhci, } skb->dev = (void *) vhci->hdev; - skb->pkt_type = *((__u8 *) skb->data); + bt_cb(skb)->pkt_type = *((__u8 *) skb->data); skb_pull(skb, 1); hci_recv_frame(skb); @@ -164,18 +164,18 @@ static inline ssize_t vhci_put_user(struct vhci_data *vhci, vhci->hdev->stat.byte_tx += len; - switch (skb->pkt_type) { - case HCI_COMMAND_PKT: - vhci->hdev->stat.cmd_tx++; - break; + switch (bt_cb(skb)->pkt_type) { + case HCI_COMMAND_PKT: + vhci->hdev->stat.cmd_tx++; + break; - case HCI_ACLDATA_PKT: - vhci->hdev->stat.acl_tx++; - break; + case HCI_ACLDATA_PKT: + vhci->hdev->stat.acl_tx++; + break; - case HCI_SCODATA_PKT: - vhci->hdev->stat.cmd_tx++; - break; + case HCI_SCODATA_PKT: + vhci->hdev->stat.cmd_tx++; + break; }; return total; diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 5309b6f..6dfa4a6 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -131,7 +131,8 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock); /* Skb helpers */ struct bt_skb_cb { - int incoming; + __u8 pkt_type; + __u8 incoming; }; #define bt_cb(skb) ((struct bt_skb_cb *)(skb->cb)) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ffa26c1..4f9e11b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -191,7 +191,7 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt) /* Special commands */ while ((skb = skb_dequeue(&hdev->driver_init))) { - skb->pkt_type = HCI_COMMAND_PKT; + bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; skb->dev = (void *) hdev; skb_queue_tail(&hdev->cmd_q, skb); hci_sched_cmd(hdev); @@ -995,7 +995,7 @@ static int hci_send_frame(struct sk_buff *skb) return -ENODEV; } - BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len); + BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len); if (atomic_read(&hdev->promisc)) { /* Time stamp */ @@ -1034,7 +1034,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p BT_DBG("skb len %d", skb->len); - skb->pkt_type = HCI_COMMAND_PKT; + bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; skb->dev = (void *) hdev; skb_queue_tail(&hdev->cmd_q, skb); hci_sched_cmd(hdev); @@ -1081,7 +1081,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags) BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags); skb->dev = (void *) hdev; - skb->pkt_type = HCI_ACLDATA_PKT; + bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags | ACL_START); if (!(list = skb_shinfo(skb)->frag_list)) { @@ -1103,7 +1103,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags) skb = list; list = list->next; skb->dev = (void *) hdev; - skb->pkt_type = HCI_ACLDATA_PKT; + bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); @@ -1139,7 +1139,7 @@ int hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE); skb->dev = (void *) hdev; - skb->pkt_type = HCI_SCODATA_PKT; + bt_cb(skb)->pkt_type = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); hci_sched_tx(hdev); return 0; @@ -1369,7 +1369,7 @@ void hci_rx_task(unsigned long arg) if (test_bit(HCI_INIT, &hdev->flags)) { /* Don't process data packets in this states. */ - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: kfree_skb(skb); @@ -1378,7 +1378,7 @@ void hci_rx_task(unsigned long arg) } /* Process frame */ - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_EVENT_PKT: hci_event_packet(hdev, skb); break; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index a004284..40b2195 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1089,7 +1089,7 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) bt_cb(skb)->incoming = 1; do_gettimeofday(&skb->stamp); - skb->pkt_type = HCI_EVENT_PKT; + bt_cb(skb)->pkt_type = HCI_EVENT_PKT; skb->dev = (void *) hdev; hci_send_to_sock(hdev, skb); kfree_skb(skb); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index ebdcce5..eed9090 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -110,11 +110,11 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) /* Apply filter */ flt = &hci_pi(sk)->filter; - if (!test_bit((skb->pkt_type == HCI_VENDOR_PKT) ? - 0 : (skb->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask)) + if (!test_bit((bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) ? + 0 : (bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask)) continue; - if (skb->pkt_type == HCI_EVENT_PKT) { + if (bt_cb(skb)->pkt_type == HCI_EVENT_PKT) { register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); if (!hci_test_bit(evt, &flt->event_mask)) @@ -131,7 +131,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) continue; /* Put type byte before the data */ - memcpy(skb_push(nskb, 1), &nskb->pkt_type, 1); + memcpy(skb_push(nskb, 1), &bt_cb(nskb)->pkt_type, 1); if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); @@ -327,8 +327,10 @@ static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_ { __u32 mask = hci_pi(sk)->cmsg_mask; - if (mask & HCI_CMSG_DIR) - put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(int), &bt_cb(skb)->incoming); + if (mask & HCI_CMSG_DIR) { + int incoming = bt_cb(skb)->incoming; + put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming); + } if (mask & HCI_CMSG_TSTAMP) put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(skb->stamp), &skb->stamp); @@ -405,11 +407,11 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, goto drop; } - skb->pkt_type = *((unsigned char *) skb->data); + bt_cb(skb)->pkt_type = *((unsigned char *) skb->data); skb_pull(skb, 1); skb->dev = (void *) hdev; - if (skb->pkt_type == HCI_COMMAND_PKT) { + if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) { u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data)); u16 ogf = hci_opcode_ogf(opcode); u16 ocf = hci_opcode_ocf(opcode); -- cgit v0.10.2 From 27258ee54f8cd4a43d09319aa5448145afc2cb8d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:30:56 -0700 Subject: [DCCP]: Introduce dccp_write_xmit from code in dccp_sendmsg This way it gets closer to the TCP flow, where congestion window checks are done, it seems we can map ccid_hc_tx_send_packet in dccp_write_xmit to tcp_snd_wnd_test in tcp_write_xmit, a CCID2 decision should just fit in here as well... Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 06105b2..469f9a1 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -43,8 +43,7 @@ struct ccid { unsigned char len, u16 idx, unsigned char* value); int (*ccid_hc_tx_send_packet)(struct sock *sk, - struct sk_buff *skb, int len, - long *delay); + struct sk_buff *skb, int len); void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more, int len); }; @@ -60,12 +59,11 @@ static inline void __ccid_get(struct ccid *ccid) } static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb, int len, - long *delay) + struct sk_buff *skb, int len) { int rc = 0; if (ccid->ccid_hc_tx_send_packet != NULL) - rc = ccid->ccid_hc_tx_send_packet(sk, skb, len, delay); + rc = ccid->ccid_hc_tx_send_packet(sk, skb, len); return rc; } diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 04299c7..df4adfe 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -977,13 +977,14 @@ out: sock_put(sk); } -static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb, - int len, long *delay) +static int ccid3_hc_tx_send_packet(struct sock *sk, + struct sk_buff *skb, int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; struct ccid3_tx_hist_entry *new_packet = NULL; struct timeval now; + long delay; int rc = -ENOTCONN; // ccid3_pr_debug("%s, sk=%p, skb=%p, len=%d\n", dccp_role(sk), sk, skb, len); @@ -1037,11 +1038,11 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb, break; case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: - *delay = (now_delta(hctx->ccid3hctx_t_nom) - hctx->ccid3hctx_delta); - ccid3_pr_debug("send_packet delay=%ld\n",*delay); - *delay /= -1000; + delay = (now_delta(hctx->ccid3hctx_t_nom) - hctx->ccid3hctx_delta); + ccid3_pr_debug("send_packet delay=%ld\n", delay); + delay /= -1000; /* divide by -1000 is to convert to ms and get sign right */ - rc = *delay > 0 ? -EAGAIN : 0; + rc = delay > 0 ? -EAGAIN : 0; break; default: printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 55b690a..8a0d7af 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -122,6 +122,9 @@ extern void dccp_send_ack(struct sock *sk); extern void dccp_send_delayed_ack(struct sock *sk); extern void dccp_send_sync(struct sock *sk, u64 seq); +extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, + const int len); + extern void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) { @@ -194,8 +197,6 @@ static inline void dccp_openreq_init(struct request_sock *req, req->rcv_wnd = 0; } -extern void dccp_v4_send_check(struct sock *sk, struct dccp_hdr *dh, int len, - struct sk_buff *skb); extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); extern struct sock *dccp_create_openreq_child(struct sock *sk, diff --git a/net/dccp/output.c b/net/dccp/output.c index 4945eaa..50292c0 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -148,6 +148,41 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) return mss_now; } +int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, const int len) +{ + const struct dccp_sock *dp = dccp_sk(sk); + int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb, len); + + if (err == 0) { + const struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + + if (sk->sk_state == DCCP_PARTOPEN) { + /* See 8.1.5. Handshake Completion */ + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + dcb->dccpd_type = DCCP_PKT_DATAACK; + /* + * FIXME: we really should have a + * dccps_ack_pending or use icsk. + */ + } else if (inet_csk_ack_scheduled(sk) || + (dp->dccps_options.dccpo_send_ack_vector && + ap->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1 && + ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)) + dcb->dccpd_type = DCCP_PKT_DATAACK; + else + dcb->dccpd_type = DCCP_PKT_DATA; + + err = dccp_transmit_skb(sk, skb); + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); + } + + return err; +} + int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { if (inet_sk_rebuild_header(sk) != 0) @@ -299,7 +334,8 @@ int dccp_connect(struct sock *sk) DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); /* Timer for repeating the REQUEST until an answer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + icsk->icsk_rto, DCCP_RTO_MAX); return 0; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 66c43fc..877c1e0 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -182,8 +182,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return -EMSGSIZE; lock_sock(sk); - - timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + timeo = sock_sndtimeo(sk, noblock); /* * We have to use sk_stream_wait_connect here to set sk_write_pending, @@ -192,77 +191,27 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Wait for a connection to finish. */ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING)) if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_err; + goto out_release; size = sk->sk_prot->max_header + len; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); - if (skb == NULL) goto out_release; skb_reserve(skb, sk->sk_prot->max_header); rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); - if (rc == 0) { - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; - long delay; - - /* - * XXX: This is just to match the Waikato tree CA interaction - * points, after the CCID3 code is stable and I have a better - * understanding of behaviour I'll change this to look more like - * TCP. - */ - while (1) { - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, - skb, len, &delay); - if (rc == 0) - break; - if (rc != -EAGAIN) - goto out_discard; - if (delay > timeo) - goto out_discard; - release_sock(sk); - delay = schedule_timeout(delay); - lock_sock(sk); - timeo -= delay; - if (signal_pending(current)) - goto out_interrupted; - rc = -EPIPE; - if (!(sk->sk_state == DCCP_PARTOPEN || sk->sk_state == DCCP_OPEN)) - goto out_discard; - } + if (rc != 0) + goto out_discard; - if (sk->sk_state == DCCP_PARTOPEN) { - /* See 8.1.5. Handshake Completion */ - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); - dcb->dccpd_type = DCCP_PKT_DATAACK; - /* FIXME: we really should have a dccps_ack_pending or use icsk */ - } else if (inet_csk_ack_scheduled(sk) || - (dp->dccps_options.dccpo_send_ack_vector && - ap->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1 && - ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)) - dcb->dccpd_type = DCCP_PKT_DATAACK; - else - dcb->dccpd_type = DCCP_PKT_DATA; - dccp_transmit_skb(sk, skb); - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); - } else { -out_discard: - kfree_skb(skb); - } + rc = dccp_write_xmit(sk, skb, len); out_release: release_sock(sk); return rc ? : len; -out_err: - rc = sk_stream_error(sk, flags, rc); +out_discard: + kfree_skb(skb); goto out_release; -out_interrupted: - rc = sock_intr_errno(timeo); - goto out_discard; } EXPORT_SYMBOL(dccp_sendmsg); -- cgit v0.10.2 From 0b4e03bf0bc43ad6250a1e2fa25fc3eb2b028977 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:31:11 -0700 Subject: [DCCP]: Initialize icsk_rto in dccp_v4_init_sock Fixes nasty bug related to the retransmit timer (yeah, DCCP does retransmits) firing too early. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 4fa56db..6bccf4d 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1219,6 +1219,7 @@ static int dccp_v4_init_sock(struct sock *sk) dccp_ctl_socket_init = 0; dccp_init_xmit_timers(sk); + inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; sk->sk_state = DCCP_CLOSED; dp->dccps_mss_cache = 536; dp->dccps_role = DCCP_ROLE_UNDEFINED; -- cgit v0.10.2 From 295ff7edb8f72b77d524759266f7524deae379b3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:44:40 -0700 Subject: [TIMEWAIT]: Introduce inet_timewait_death_row That groups all of the tables and variables associated to the TCP timewait schedulling/recycling/killing code, that now can be isolated from the TCP specific code and used by other transport protocols, such as DCCP. Next changeset will move this code to net/ipv4/inet_timewait_sock.c Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index e00861b..a7e8052 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -19,13 +19,69 @@ #include #include +#include #include +#include #include #include #include +struct inet_hashinfo; + +#define INET_TWDR_RECYCLE_SLOTS_LOG 5 +#define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG) + +/* + * If time > 4sec, it is "slow" path, no recycling is required, + * so that we select tick to get range about 4 seconds. + */ +#if HZ <= 16 || HZ > 4096 +# error Unsupported: HZ <= 16 or HZ > 4096 +#elif HZ <= 32 +# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 64 +# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 128 +# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 256 +# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 512 +# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 1024 +# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 2048 +# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#else +# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#endif + +/* TIME_WAIT reaping mechanism. */ +#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ + +#define INET_TWDR_TWKILL_QUOTA 100 + +struct inet_timewait_death_row { + /* Short-time timewait calendar */ + int twcal_hand; + int twcal_jiffie; + struct timer_list twcal_timer; + struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS]; + + spinlock_t death_lock; + int tw_count; + int period; + u32 thread_slots; + struct work_struct twkill_work; + struct timer_list tw_timer; + int slot; + struct hlist_head cells[INET_TWDR_TWKILL_SLOTS]; + struct inet_hashinfo *hashinfo; + int sysctl_tw_recycle; + int sysctl_max_tw_buckets; +}; + #if (BITS_PER_LONG == 64) #define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8 #else @@ -33,7 +89,6 @@ #endif struct inet_bind_bucket; -struct inet_hashinfo; /* * This is a TIME_WAIT sock. It works around the memory consumption diff --git a/include/net/tcp.h b/include/net/tcp.h index 077db85..4c4cd4f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -42,9 +43,9 @@ extern struct inet_hashinfo tcp_hashinfo; extern atomic_t tcp_orphan_count; -extern int tcp_tw_count; extern void tcp_time_wait(struct sock *sk, int state, int timeo); -extern void tcp_tw_deschedule(struct inet_timewait_sock *tw); +extern void inet_twsk_deschedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr); #define MAX_TCP_HEADER (128 + MAX_HEADER) @@ -148,33 +149,6 @@ extern void tcp_tw_deschedule(struct inet_timewait_sock *tw); * timestamps. It must be less than * minimal timewait lifetime. */ - -#define TCP_TW_RECYCLE_SLOTS_LOG 5 -#define TCP_TW_RECYCLE_SLOTS (1< 4sec, it is "slow" path, no recycling is required, - so that we select tick to get range about 4 seconds. - */ - -#if HZ <= 16 || HZ > 4096 -# error Unsupported: HZ <= 16 or HZ > 4096 -#elif HZ <= 32 -# define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 64 -# define TCP_TW_RECYCLE_TICK (6+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 128 -# define TCP_TW_RECYCLE_TICK (7+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 256 -# define TCP_TW_RECYCLE_TICK (8+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 512 -# define TCP_TW_RECYCLE_TICK (9+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 1024 -# define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 2048 -# define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG) -#else -# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) -#endif /* * TCP option */ @@ -209,12 +183,13 @@ extern void tcp_tw_deschedule(struct inet_timewait_sock *tw); #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overriden for already queued data */ +extern struct inet_timewait_death_row tcp_death_row; + /* sysctl variables for tcp */ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_tw_recycle; extern int sysctl_tcp_keepalive_time; extern int sysctl_tcp_keepalive_probes; extern int sysctl_tcp_keepalive_intvl; @@ -229,7 +204,6 @@ extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_max_tw_buckets; extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; extern int sysctl_tcp_ecn; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 912bbcc..3eadbb2 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -65,7 +65,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), - tcp_tw_count, atomic_read(&tcp_sockets_allocated), + tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), atomic_read(&tcp_memory_allocated)); seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e328945..ce47a34 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -259,7 +259,7 @@ ctl_table ipv4_table[] = { { .ctl_name = NET_TCP_MAX_TW_BUCKETS, .procname = "tcp_max_tw_buckets", - .data = &sysctl_tcp_max_tw_buckets, + .data = &tcp_death_row.sysctl_max_tw_buckets, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec @@ -363,7 +363,7 @@ ctl_table ipv4_table[] = { { .ctl_name = NET_TCP_TW_RECYCLE, .procname = "tcp_tw_recycle", - .data = &sysctl_tcp_tw_recycle, + .data = &tcp_death_row.sysctl_tw_recycle, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4bda522..0eed64a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2109,12 +2109,12 @@ void __init tcp_init(void) if (order >= 4) { sysctl_local_port_range[0] = 32768; sysctl_local_port_range[1] = 61000; - sysctl_tcp_max_tw_buckets = 180000; + tcp_death_row.sysctl_max_tw_buckets = 180000; sysctl_tcp_max_orphans = 4096 << (order - 4); sysctl_max_syn_backlog = 1024; } else if (order < 3) { sysctl_local_port_range[0] = 1024 * (3 - order); - sysctl_tcp_max_tw_buckets >>= (3 - order); + tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b966102..83f7234 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -199,7 +199,7 @@ unique: NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... */ - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); @@ -291,7 +291,7 @@ ok: spin_unlock(&head->lock); if (tw) { - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row);; inet_twsk_put(tw); } @@ -366,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->write_seq = 0; } - if (sysctl_tcp_tw_recycle && + if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { struct inet_peer *peer = rt_get_peer(rt); @@ -965,7 +965,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * are made in the function processing timewait state. */ if (tmp_opt.saw_tstamp && - sysctl_tcp_tw_recycle && + tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { @@ -1305,7 +1305,8 @@ do_time_wait: ntohs(th->dest), inet_iif(skb)); if (sk2) { - tcp_tw_deschedule((struct inet_timewait_sock *)sk); + inet_twsk_deschedule((struct inet_timewait_sock *)sk, + &tcp_death_row); inet_twsk_put((struct inet_timewait_sock *)sk); sk = sk2; goto process; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2d95afe..81b9a52 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -35,13 +35,37 @@ #define SYNC_INIT 1 #endif -int sysctl_tcp_tw_recycle; -int sysctl_tcp_max_tw_buckets = NR_FILE*2; +/* New-style handling of TIME_WAIT sockets. */ + +static void inet_twdr_hangman(unsigned long data); +static void inet_twdr_twkill_work(void *data); +static void inet_twdr_twcal_tick(unsigned long data); int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_abort_on_overflow; -static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo); +struct inet_timewait_death_row tcp_death_row = { + .sysctl_max_tw_buckets = NR_FILE * 2, + .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, + .death_lock = SPIN_LOCK_UNLOCKED, + .hashinfo = &tcp_hashinfo, + .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, + (unsigned long)&tcp_death_row), + .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, + inet_twdr_twkill_work, + &tcp_death_row), +/* Short-time timewait calendar */ + + .twcal_hand = -1, + .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, + (unsigned long)&tcp_death_row), +}; + +EXPORT_SYMBOL_GPL(tcp_death_row); + +static void inet_twsk_schedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr, + const int timeo); static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -52,10 +76,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) return (seq == e_win && seq == end_seq); } -/* New-style handling of TIME_WAIT sockets. */ - -int tcp_tw_count; - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -132,7 +152,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row); inet_twsk_put(tw); return TCP_TW_RST; } @@ -151,11 +171,11 @@ kill_with_rst: * do not undertsnad recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && - sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp && + tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_v4_tw_remember_stamp(tw)) - tcp_tw_schedule(tw, tw->tw_timeout); + inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout); else - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -188,12 +208,12 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row); inet_twsk_put(tw); return TCP_TW_SUCCESS; } } - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -243,7 +263,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); /* Send ACK. Note, we do not put the bucket, * it will be released by caller. @@ -263,10 +283,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; - if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) + if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tp->af_specific->remember_stamp(sk); - if (tcp_tw_count < sysctl_tcp_max_tw_buckets) + if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { @@ -306,7 +326,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = TCP_TIMEWAIT_LEN; } - tcp_tw_schedule(tw, timeo); + inet_twsk_schedule(tw, &tcp_death_row, timeo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this @@ -321,26 +341,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcp_done(sk); } -/* Kill off TIME_WAIT sockets once their lifetime has expired. */ -static int tcp_tw_death_row_slot; - -static void tcp_twkill(unsigned long); - -/* TIME_WAIT reaping mechanism. */ -#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ -#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS) - -#define TCP_TWKILL_QUOTA 100 - -static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS]; -static DEFINE_SPINLOCK(tw_death_lock); -static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0); -static void twkill_work(void *); -static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL); -static u32 twkill_thread_slots; - /* Returns non-zero if quota exceeded. */ -static int tcp_do_twkill_work(int slot, unsigned int quota) +static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, + const int slot) { struct inet_timewait_sock *tw; struct hlist_node *node; @@ -356,19 +359,19 @@ static int tcp_do_twkill_work(int slot, unsigned int quota) killed = 0; ret = 0; rescan: - inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { + inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { __inet_twsk_del_dead_node(tw); - spin_unlock(&tw_death_lock); - __inet_twsk_kill(tw, &tcp_hashinfo); + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); inet_twsk_put(tw); killed++; - spin_lock(&tw_death_lock); - if (killed > quota) { + spin_lock(&twdr->death_lock); + if (killed > INET_TWDR_TWKILL_QUOTA) { ret = 1; break; } - /* While we dropped tw_death_lock, another cpu may have + /* While we dropped twdr->death_lock, another cpu may have * killed off the next TW bucket in the list, therefore * do a fresh re-read of the hlist head node with the * lock reacquired. We still use the hlist traversal @@ -377,67 +380,68 @@ rescan: goto rescan; } - tcp_tw_count -= killed; + twdr->tw_count -= killed; NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); return ret; } -static void tcp_twkill(unsigned long dummy) +static void inet_twdr_hangman(unsigned long data) { - int need_timer, ret; + struct inet_timewait_death_row *twdr; + int unsigned need_timer; - spin_lock(&tw_death_lock); + twdr = (struct inet_timewait_death_row *)data; + spin_lock(&twdr->death_lock); - if (tcp_tw_count == 0) + if (twdr->tw_count == 0) goto out; need_timer = 0; - ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA); - if (ret) { - twkill_thread_slots |= (1 << tcp_tw_death_row_slot); + if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { + twdr->thread_slots |= (1 << twdr->slot); mb(); - schedule_work(&tcp_twkill_work); + schedule_work(&twdr->twkill_work); need_timer = 1; } else { /* We purged the entire slot, anything left? */ - if (tcp_tw_count) + if (twdr->tw_count) need_timer = 1; } - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); if (need_timer) - mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD); + mod_timer(&twdr->tw_timer, jiffies + twdr->period); out: - spin_unlock(&tw_death_lock); + spin_unlock(&twdr->death_lock); } extern void twkill_slots_invalid(void); -static void twkill_work(void *dummy) +static void inet_twdr_twkill_work(void *data) { + struct inet_timewait_death_row *twdr = data; int i; - if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8)) + if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8)) twkill_slots_invalid(); - while (twkill_thread_slots) { - spin_lock_bh(&tw_death_lock); - for (i = 0; i < TCP_TWKILL_SLOTS; i++) { - if (!(twkill_thread_slots & (1 << i))) + while (twdr->thread_slots) { + spin_lock_bh(&twdr->death_lock); + for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { + if (!(twdr->thread_slots & (1 << i))) continue; - while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) { + while (inet_twdr_do_twkill_work(twdr, i) != 0) { if (need_resched()) { - spin_unlock_bh(&tw_death_lock); + spin_unlock_bh(&twdr->death_lock); schedule(); - spin_lock_bh(&tw_death_lock); + spin_lock_bh(&twdr->death_lock); } } - twkill_thread_slots &= ~(1 << i); + twdr->thread_slots &= ~(1 << i); } - spin_unlock_bh(&tw_death_lock); + spin_unlock_bh(&twdr->death_lock); } } @@ -446,28 +450,22 @@ static void twkill_work(void *dummy) */ /* This is for handling early-kills of TIME_WAIT sockets. */ -void tcp_tw_deschedule(struct inet_timewait_sock *tw) +void inet_twsk_deschedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr) { - spin_lock(&tw_death_lock); + spin_lock(&twdr->death_lock); if (inet_twsk_del_dead_node(tw)) { inet_twsk_put(tw); - if (--tcp_tw_count == 0) - del_timer(&tcp_tw_timer); + if (--twdr->tw_count == 0) + del_timer(&twdr->tw_timer); } - spin_unlock(&tw_death_lock); - __inet_twsk_kill(tw, &tcp_hashinfo); + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); } -/* Short-time timewait calendar */ - -static int tcp_twcal_hand = -1; -static int tcp_twcal_jiffie; -static void tcp_twcal_tick(unsigned long); -static struct timer_list tcp_twcal_timer = - TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); -static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; - -static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo) +static void inet_twsk_schedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr, + const int timeo) { struct hlist_head *list; int slot; @@ -496,100 +494,106 @@ static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo) * is greater than TS tick!) and detect old duplicates with help * of PAWS. */ - slot = (timeo + (1<> TCP_TW_RECYCLE_TICK; + slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; - spin_lock(&tw_death_lock); + spin_lock(&twdr->death_lock); /* Unlink it, if it was scheduled */ if (inet_twsk_del_dead_node(tw)) - tcp_tw_count--; + twdr->tw_count--; else atomic_inc(&tw->tw_refcnt); - if (slot >= TCP_TW_RECYCLE_SLOTS) { + if (slot >= INET_TWDR_RECYCLE_SLOTS) { /* Schedule to slow timer */ if (timeo >= TCP_TIMEWAIT_LEN) { - slot = TCP_TWKILL_SLOTS-1; + slot = INET_TWDR_TWKILL_SLOTS - 1; } else { - slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; - if (slot >= TCP_TWKILL_SLOTS) - slot = TCP_TWKILL_SLOTS-1; + slot = (timeo + twdr->period - 1) / twdr->period; + if (slot >= INET_TWDR_TWKILL_SLOTS) + slot = INET_TWDR_TWKILL_SLOTS - 1; } tw->tw_ttd = jiffies + timeo; - slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); - list = &tcp_tw_death_row[slot]; + slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); + list = &twdr->cells[slot]; } else { - tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK); - - if (tcp_twcal_hand < 0) { - tcp_twcal_hand = 0; - tcp_twcal_jiffie = jiffies; - tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); + + if (twdr->twcal_hand < 0) { + twdr->twcal_hand = 0; + twdr->twcal_jiffie = jiffies; + twdr->twcal_timer.expires = twdr->twcal_jiffie + + (slot << INET_TWDR_RECYCLE_TICK); + add_timer(&twdr->twcal_timer); } else { - if (time_after(tcp_twcal_timer.expires, jiffies + (slot<twcal_timer.expires, + jiffies + (slot << INET_TWDR_RECYCLE_TICK))) + mod_timer(&twdr->twcal_timer, + jiffies + (slot << INET_TWDR_RECYCLE_TICK)); + slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); } - list = &tcp_twcal_row[slot]; + list = &twdr->twcal_row[slot]; } hlist_add_head(&tw->tw_death_node, list); - if (tcp_tw_count++ == 0) - mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); - spin_unlock(&tw_death_lock); + if (twdr->tw_count++ == 0) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); + spin_unlock(&twdr->death_lock); } -void tcp_twcal_tick(unsigned long dummy) +void inet_twdr_twcal_tick(unsigned long data) { + struct inet_timewait_death_row *twdr; int n, slot; unsigned long j; unsigned long now = jiffies; int killed = 0; int adv = 0; - spin_lock(&tw_death_lock); - if (tcp_twcal_hand < 0) + twdr = (struct inet_timewait_death_row *)data; + + spin_lock(&twdr->death_lock); + if (twdr->twcal_hand < 0) goto out; - slot = tcp_twcal_hand; - j = tcp_twcal_jiffie; + slot = twdr->twcal_hand; + j = twdr->twcal_jiffie; - for (n=0; ntwcal_row[slot]) { __inet_twsk_del_dead_node(tw); - __inet_twsk_kill(tw, &tcp_hashinfo); + __inet_twsk_kill(tw, twdr->hashinfo); inet_twsk_put(tw); killed++; } } else { if (!adv) { adv = 1; - tcp_twcal_jiffie = j; - tcp_twcal_hand = slot; + twdr->twcal_jiffie = j; + twdr->twcal_hand = slot; } - if (!hlist_empty(&tcp_twcal_row[slot])) { - mod_timer(&tcp_twcal_timer, j); + if (!hlist_empty(&twdr->twcal_row[slot])) { + mod_timer(&twdr->twcal_timer, j); goto out; } } - j += (1<twcal_hand = -1; out: - if ((tcp_tw_count -= killed) == 0) - del_timer(&tcp_tw_timer); + if ((twdr->tw_count -= killed) == 0) + del_timer(&twdr->tw_timer); NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); - spin_unlock(&tw_death_lock); + spin_unlock(&twdr->death_lock); } /* This is not only more efficient than what we used to do, it eliminates @@ -929,4 +933,4 @@ EXPORT_SYMBOL(tcp_check_req); EXPORT_SYMBOL(tcp_child_process); EXPORT_SYMBOL(tcp_create_openreq_child); EXPORT_SYMBOL(tcp_timewait_state_process); -EXPORT_SYMBOL(tcp_tw_deschedule); +EXPORT_SYMBOL(inet_twsk_deschedule); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0b51ec3..1c21ad6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -521,7 +521,7 @@ unique: NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... */ - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); @@ -611,7 +611,7 @@ ok: spin_unlock(&head->lock); if (tw) { - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row); inet_twsk_put(tw); } @@ -1820,8 +1820,9 @@ do_time_wait: sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); if (sk2 != NULL) { - tcp_tw_deschedule((struct inet_timewait_sock *)sk); - inet_twsk_put((struct inet_timewait_sock *)sk); + struct inet_timewait_sock *tw = inet_twsk(sk); + inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_put(tw); sk = sk2; goto process; } -- cgit v0.10.2 From 696ab2d3bffc746fb8cf3712f066d42b9886aeed Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:45:03 -0700 Subject: [TIMEWAIT]: Move inet_timewait_death_row routines to net/ipv4/inet_timewait_sock.c Also export the ones that will be used in the next changeset, when DCCP uses this infrastructure. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index a7e8052..3b07035 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -82,6 +82,10 @@ struct inet_timewait_death_row { int sysctl_max_tw_buckets; }; +extern void inet_twdr_hangman(unsigned long data); +extern void inet_twdr_twkill_work(void *data); +extern void inet_twdr_twcal_tick(unsigned long data); + #if (BITS_PER_LONG == 64) #define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8 #else @@ -206,4 +210,10 @@ extern void __inet_twsk_kill(struct inet_timewait_sock *tw, extern void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo); + +extern void inet_twsk_schedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr, + const int timeo, const int timewait_len); +extern void inet_twsk_deschedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr); #endif /* _INET_TIMEWAIT_SOCK_ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 4c4cd4f..d489ac5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -44,8 +44,6 @@ extern struct inet_hashinfo tcp_hashinfo; extern atomic_t tcp_orphan_count; extern void tcp_time_wait(struct sock *sk, int state, int timeo); -extern void inet_twsk_deschedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr); #define MAX_TCP_HEADER (128 + MAX_HEADER) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 22882d9..4d1502a 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -12,6 +12,7 @@ #include #include +#include /* Must be called with locally disabled BHs. */ void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) @@ -85,6 +86,8 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, write_unlock(&ehead->lock); } +EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); + struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) { struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, @@ -112,3 +115,270 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat return tw; } + +EXPORT_SYMBOL_GPL(inet_twsk_alloc); + +/* Returns non-zero if quota exceeded. */ +static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, + const int slot) +{ + struct inet_timewait_sock *tw; + struct hlist_node *node; + unsigned int killed; + int ret; + + /* NOTE: compare this to previous version where lock + * was released after detaching chain. It was racy, + * because tw buckets are scheduled in not serialized context + * in 2.3 (with netfilter), and with softnet it is common, because + * soft irqs are not sequenced. + */ + killed = 0; + ret = 0; +rescan: + inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { + __inet_twsk_del_dead_node(tw); + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); + inet_twsk_put(tw); + killed++; + spin_lock(&twdr->death_lock); + if (killed > INET_TWDR_TWKILL_QUOTA) { + ret = 1; + break; + } + + /* While we dropped twdr->death_lock, another cpu may have + * killed off the next TW bucket in the list, therefore + * do a fresh re-read of the hlist head node with the + * lock reacquired. We still use the hlist traversal + * macro in order to get the prefetches. + */ + goto rescan; + } + + twdr->tw_count -= killed; + NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); + + return ret; +} + +void inet_twdr_hangman(unsigned long data) +{ + struct inet_timewait_death_row *twdr; + int unsigned need_timer; + + twdr = (struct inet_timewait_death_row *)data; + spin_lock(&twdr->death_lock); + + if (twdr->tw_count == 0) + goto out; + + need_timer = 0; + if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { + twdr->thread_slots |= (1 << twdr->slot); + mb(); + schedule_work(&twdr->twkill_work); + need_timer = 1; + } else { + /* We purged the entire slot, anything left? */ + if (twdr->tw_count) + need_timer = 1; + } + twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); + if (need_timer) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); +out: + spin_unlock(&twdr->death_lock); +} + +EXPORT_SYMBOL_GPL(inet_twdr_hangman); + +extern void twkill_slots_invalid(void); + +void inet_twdr_twkill_work(void *data) +{ + struct inet_timewait_death_row *twdr = data; + int i; + + if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8)) + twkill_slots_invalid(); + + while (twdr->thread_slots) { + spin_lock_bh(&twdr->death_lock); + for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { + if (!(twdr->thread_slots & (1 << i))) + continue; + + while (inet_twdr_do_twkill_work(twdr, i) != 0) { + if (need_resched()) { + spin_unlock_bh(&twdr->death_lock); + schedule(); + spin_lock_bh(&twdr->death_lock); + } + } + + twdr->thread_slots &= ~(1 << i); + } + spin_unlock_bh(&twdr->death_lock); + } +} + +EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); + +/* These are always called from BH context. See callers in + * tcp_input.c to verify this. + */ + +/* This is for handling early-kills of TIME_WAIT sockets. */ +void inet_twsk_deschedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr) +{ + spin_lock(&twdr->death_lock); + if (inet_twsk_del_dead_node(tw)) { + inet_twsk_put(tw); + if (--twdr->tw_count == 0) + del_timer(&twdr->tw_timer); + } + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); +} + +EXPORT_SYMBOL(inet_twsk_deschedule); + +void inet_twsk_schedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr, + const int timeo, const int timewait_len) +{ + struct hlist_head *list; + int slot; + + /* timeout := RTO * 3.5 + * + * 3.5 = 1+2+0.5 to wait for two retransmits. + * + * RATIONALE: if FIN arrived and we entered TIME-WAIT state, + * our ACK acking that FIN can be lost. If N subsequent retransmitted + * FINs (or previous seqments) are lost (probability of such event + * is p^(N+1), where p is probability to lose single packet and + * time to detect the loss is about RTO*(2^N - 1) with exponential + * backoff). Normal timewait length is calculated so, that we + * waited at least for one retransmitted FIN (maximal RTO is 120sec). + * [ BTW Linux. following BSD, violates this requirement waiting + * only for 60sec, we should wait at least for 240 secs. + * Well, 240 consumes too much of resources 8) + * ] + * This interval is not reduced to catch old duplicate and + * responces to our wandering segments living for two MSLs. + * However, if we use PAWS to detect + * old duplicates, we can reduce the interval to bounds required + * by RTO, rather than MSL. So, if peer understands PAWS, we + * kill tw bucket after 3.5*RTO (it is important that this number + * is greater than TS tick!) and detect old duplicates with help + * of PAWS. + */ + slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; + + spin_lock(&twdr->death_lock); + + /* Unlink it, if it was scheduled */ + if (inet_twsk_del_dead_node(tw)) + twdr->tw_count--; + else + atomic_inc(&tw->tw_refcnt); + + if (slot >= INET_TWDR_RECYCLE_SLOTS) { + /* Schedule to slow timer */ + if (timeo >= timewait_len) { + slot = INET_TWDR_TWKILL_SLOTS - 1; + } else { + slot = (timeo + twdr->period - 1) / twdr->period; + if (slot >= INET_TWDR_TWKILL_SLOTS) + slot = INET_TWDR_TWKILL_SLOTS - 1; + } + tw->tw_ttd = jiffies + timeo; + slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); + list = &twdr->cells[slot]; + } else { + tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); + + if (twdr->twcal_hand < 0) { + twdr->twcal_hand = 0; + twdr->twcal_jiffie = jiffies; + twdr->twcal_timer.expires = twdr->twcal_jiffie + + (slot << INET_TWDR_RECYCLE_TICK); + add_timer(&twdr->twcal_timer); + } else { + if (time_after(twdr->twcal_timer.expires, + jiffies + (slot << INET_TWDR_RECYCLE_TICK))) + mod_timer(&twdr->twcal_timer, + jiffies + (slot << INET_TWDR_RECYCLE_TICK)); + slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); + } + list = &twdr->twcal_row[slot]; + } + + hlist_add_head(&tw->tw_death_node, list); + + if (twdr->tw_count++ == 0) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); + spin_unlock(&twdr->death_lock); +} + +EXPORT_SYMBOL_GPL(inet_twsk_schedule); + +void inet_twdr_twcal_tick(unsigned long data) +{ + struct inet_timewait_death_row *twdr; + int n, slot; + unsigned long j; + unsigned long now = jiffies; + int killed = 0; + int adv = 0; + + twdr = (struct inet_timewait_death_row *)data; + + spin_lock(&twdr->death_lock); + if (twdr->twcal_hand < 0) + goto out; + + slot = twdr->twcal_hand; + j = twdr->twcal_jiffie; + + for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { + if (time_before_eq(j, now)) { + struct hlist_node *node, *safe; + struct inet_timewait_sock *tw; + + inet_twsk_for_each_inmate_safe(tw, node, safe, + &twdr->twcal_row[slot]) { + __inet_twsk_del_dead_node(tw); + __inet_twsk_kill(tw, twdr->hashinfo); + inet_twsk_put(tw); + killed++; + } + } else { + if (!adv) { + adv = 1; + twdr->twcal_jiffie = j; + twdr->twcal_hand = slot; + } + + if (!hlist_empty(&twdr->twcal_row[slot])) { + mod_timer(&twdr->twcal_timer, j); + goto out; + } + } + j += 1 << INET_TWDR_RECYCLE_TICK; + slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); + } + twdr->twcal_hand = -1; + +out: + if ((twdr->tw_count -= killed) == 0) + del_timer(&twdr->tw_timer); + NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); + spin_unlock(&twdr->death_lock); +} + +EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 81b9a52..dc08523 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -35,12 +35,6 @@ #define SYNC_INIT 1 #endif -/* New-style handling of TIME_WAIT sockets. */ - -static void inet_twdr_hangman(unsigned long data); -static void inet_twdr_twkill_work(void *data); -static void inet_twdr_twcal_tick(unsigned long data); - int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_abort_on_overflow; @@ -63,10 +57,6 @@ struct inet_timewait_death_row tcp_death_row = { EXPORT_SYMBOL_GPL(tcp_death_row); -static void inet_twsk_schedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr, - const int timeo); - static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -173,9 +163,11 @@ kill_with_rst: if (tw->tw_family == AF_INET && tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_v4_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout); + inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, + TCP_TIMEWAIT_LEN); else - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -213,7 +205,8 @@ kill: return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -263,7 +256,8 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); /* Send ACK. Note, we do not put the bucket, * it will be released by caller. @@ -326,7 +320,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = TCP_TIMEWAIT_LEN; } - inet_twsk_schedule(tw, &tcp_death_row, timeo); + inet_twsk_schedule(tw, &tcp_death_row, timeo, + TCP_TIMEWAIT_LEN); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this @@ -341,261 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcp_done(sk); } -/* Returns non-zero if quota exceeded. */ -static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, - const int slot) -{ - struct inet_timewait_sock *tw; - struct hlist_node *node; - unsigned int killed; - int ret; - - /* NOTE: compare this to previous version where lock - * was released after detaching chain. It was racy, - * because tw buckets are scheduled in not serialized context - * in 2.3 (with netfilter), and with softnet it is common, because - * soft irqs are not sequenced. - */ - killed = 0; - ret = 0; -rescan: - inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { - __inet_twsk_del_dead_node(tw); - spin_unlock(&twdr->death_lock); - __inet_twsk_kill(tw, twdr->hashinfo); - inet_twsk_put(tw); - killed++; - spin_lock(&twdr->death_lock); - if (killed > INET_TWDR_TWKILL_QUOTA) { - ret = 1; - break; - } - - /* While we dropped twdr->death_lock, another cpu may have - * killed off the next TW bucket in the list, therefore - * do a fresh re-read of the hlist head node with the - * lock reacquired. We still use the hlist traversal - * macro in order to get the prefetches. - */ - goto rescan; - } - - twdr->tw_count -= killed; - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); - - return ret; -} - -static void inet_twdr_hangman(unsigned long data) -{ - struct inet_timewait_death_row *twdr; - int unsigned need_timer; - - twdr = (struct inet_timewait_death_row *)data; - spin_lock(&twdr->death_lock); - - if (twdr->tw_count == 0) - goto out; - - need_timer = 0; - if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { - twdr->thread_slots |= (1 << twdr->slot); - mb(); - schedule_work(&twdr->twkill_work); - need_timer = 1; - } else { - /* We purged the entire slot, anything left? */ - if (twdr->tw_count) - need_timer = 1; - } - twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); - if (need_timer) - mod_timer(&twdr->tw_timer, jiffies + twdr->period); -out: - spin_unlock(&twdr->death_lock); -} - -extern void twkill_slots_invalid(void); - -static void inet_twdr_twkill_work(void *data) -{ - struct inet_timewait_death_row *twdr = data; - int i; - - if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8)) - twkill_slots_invalid(); - - while (twdr->thread_slots) { - spin_lock_bh(&twdr->death_lock); - for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { - if (!(twdr->thread_slots & (1 << i))) - continue; - - while (inet_twdr_do_twkill_work(twdr, i) != 0) { - if (need_resched()) { - spin_unlock_bh(&twdr->death_lock); - schedule(); - spin_lock_bh(&twdr->death_lock); - } - } - - twdr->thread_slots &= ~(1 << i); - } - spin_unlock_bh(&twdr->death_lock); - } -} - -/* These are always called from BH context. See callers in - * tcp_input.c to verify this. - */ - -/* This is for handling early-kills of TIME_WAIT sockets. */ -void inet_twsk_deschedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr) -{ - spin_lock(&twdr->death_lock); - if (inet_twsk_del_dead_node(tw)) { - inet_twsk_put(tw); - if (--twdr->tw_count == 0) - del_timer(&twdr->tw_timer); - } - spin_unlock(&twdr->death_lock); - __inet_twsk_kill(tw, twdr->hashinfo); -} - -static void inet_twsk_schedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr, - const int timeo) -{ - struct hlist_head *list; - int slot; - - /* timeout := RTO * 3.5 - * - * 3.5 = 1+2+0.5 to wait for two retransmits. - * - * RATIONALE: if FIN arrived and we entered TIME-WAIT state, - * our ACK acking that FIN can be lost. If N subsequent retransmitted - * FINs (or previous seqments) are lost (probability of such event - * is p^(N+1), where p is probability to lose single packet and - * time to detect the loss is about RTO*(2^N - 1) with exponential - * backoff). Normal timewait length is calculated so, that we - * waited at least for one retransmitted FIN (maximal RTO is 120sec). - * [ BTW Linux. following BSD, violates this requirement waiting - * only for 60sec, we should wait at least for 240 secs. - * Well, 240 consumes too much of resources 8) - * ] - * This interval is not reduced to catch old duplicate and - * responces to our wandering segments living for two MSLs. - * However, if we use PAWS to detect - * old duplicates, we can reduce the interval to bounds required - * by RTO, rather than MSL. So, if peer understands PAWS, we - * kill tw bucket after 3.5*RTO (it is important that this number - * is greater than TS tick!) and detect old duplicates with help - * of PAWS. - */ - slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; - - spin_lock(&twdr->death_lock); - - /* Unlink it, if it was scheduled */ - if (inet_twsk_del_dead_node(tw)) - twdr->tw_count--; - else - atomic_inc(&tw->tw_refcnt); - - if (slot >= INET_TWDR_RECYCLE_SLOTS) { - /* Schedule to slow timer */ - if (timeo >= TCP_TIMEWAIT_LEN) { - slot = INET_TWDR_TWKILL_SLOTS - 1; - } else { - slot = (timeo + twdr->period - 1) / twdr->period; - if (slot >= INET_TWDR_TWKILL_SLOTS) - slot = INET_TWDR_TWKILL_SLOTS - 1; - } - tw->tw_ttd = jiffies + timeo; - slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); - list = &twdr->cells[slot]; - } else { - tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); - - if (twdr->twcal_hand < 0) { - twdr->twcal_hand = 0; - twdr->twcal_jiffie = jiffies; - twdr->twcal_timer.expires = twdr->twcal_jiffie + - (slot << INET_TWDR_RECYCLE_TICK); - add_timer(&twdr->twcal_timer); - } else { - if (time_after(twdr->twcal_timer.expires, - jiffies + (slot << INET_TWDR_RECYCLE_TICK))) - mod_timer(&twdr->twcal_timer, - jiffies + (slot << INET_TWDR_RECYCLE_TICK)); - slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); - } - list = &twdr->twcal_row[slot]; - } - - hlist_add_head(&tw->tw_death_node, list); - - if (twdr->tw_count++ == 0) - mod_timer(&twdr->tw_timer, jiffies + twdr->period); - spin_unlock(&twdr->death_lock); -} - -void inet_twdr_twcal_tick(unsigned long data) -{ - struct inet_timewait_death_row *twdr; - int n, slot; - unsigned long j; - unsigned long now = jiffies; - int killed = 0; - int adv = 0; - - twdr = (struct inet_timewait_death_row *)data; - - spin_lock(&twdr->death_lock); - if (twdr->twcal_hand < 0) - goto out; - - slot = twdr->twcal_hand; - j = twdr->twcal_jiffie; - - for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { - if (time_before_eq(j, now)) { - struct hlist_node *node, *safe; - struct inet_timewait_sock *tw; - - inet_twsk_for_each_inmate_safe(tw, node, safe, - &twdr->twcal_row[slot]) { - __inet_twsk_del_dead_node(tw); - __inet_twsk_kill(tw, twdr->hashinfo); - inet_twsk_put(tw); - killed++; - } - } else { - if (!adv) { - adv = 1; - twdr->twcal_jiffie = j; - twdr->twcal_hand = slot; - } - - if (!hlist_empty(&twdr->twcal_row[slot])) { - mod_timer(&twdr->twcal_timer, j); - goto out; - } - } - j += 1 << INET_TWDR_RECYCLE_TICK; - slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); - } - twdr->twcal_hand = -1; - -out: - if ((twdr->tw_count -= killed) == 0) - del_timer(&twdr->tw_timer); - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); - spin_unlock(&twdr->death_lock); -} - /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -933,4 +673,3 @@ EXPORT_SYMBOL(tcp_check_req); EXPORT_SYMBOL(tcp_child_process); EXPORT_SYMBOL(tcp_create_openreq_child); EXPORT_SYMBOL(tcp_timewait_state_process); -EXPORT_SYMBOL(inet_twsk_deschedule); -- cgit v0.10.2 From 64cf1e5d8b5f88d56509260e08fa0d8314277350 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:45:21 -0700 Subject: [DCCP]: Finish the TIMEWAIT minisock support Using most of the infrastructure TCP uses, with a dccp_death_row, etc. As per my current interpretation of the draft what we have with this changeset seems to be all we need (or very close to it 8)). Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 431d589..3dccdd5 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -194,6 +194,7 @@ enum { #include #include +#include #include #include #include @@ -354,6 +355,8 @@ static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) return (struct dccp_request_sock *)req; } +extern struct inet_timewait_death_row dccp_death_row; + /* Read about the ECN nonce to see why it is 253 */ #define DCCP_MAX_ACK_VECTOR_LEN 253 diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 6bccf4d..f6da932 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -97,7 +97,7 @@ static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); } else if (tw != NULL) { /* Silly. Should hash-dance instead... */ - dccp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &dccp_death_row); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); @@ -201,7 +201,7 @@ ok: spin_unlock(&head->lock); if (tw != NULL) { - dccp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &dccp_death_row); inet_twsk_put(tw); } @@ -1131,8 +1131,9 @@ int dccp_v4_rcv(struct sk_buff *skb) */ if (sk->sk_state == DCCP_TIME_WAIT) { - dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: discard_and_relse\n"); - goto discard_and_relse; + dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: " + "do_time_wait\n"); + goto do_time_wait; } if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { @@ -1179,6 +1180,10 @@ discard_it: discard_and_relse: sock_put(sk); goto discard_it; + +do_time_wait: + inet_twsk_put((struct inet_timewait_sock *)sk); + goto no_dccp_socket; } static int dccp_v4_init_sock(struct sock *sk) @@ -1290,5 +1295,5 @@ struct proto dccp_v4_prot = { .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), .rsk_prot = &dccp_request_sock_ops, - .twsk_obj_size = sizeof(struct inet_timewait_sock), /* FIXME! create dccp_timewait_sock */ + .twsk_obj_size = sizeof(struct inet_timewait_sock), }; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index e498e38..a6a0b27 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -22,18 +22,58 @@ #include "ccid.h" #include "dccp.h" +struct inet_timewait_death_row dccp_death_row = { + .sysctl_max_tw_buckets = NR_FILE * 2, + .period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, + .death_lock = SPIN_LOCK_UNLOCKED, + .hashinfo = &dccp_hashinfo, + .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, + (unsigned long)&dccp_death_row), + .twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work, + inet_twdr_twkill_work, + &dccp_death_row), +/* Short-time timewait calendar */ + + .twcal_hand = -1, + .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, + (unsigned long)&dccp_death_row), +}; + void dccp_time_wait(struct sock *sk, int state, int timeo) { - /* FIXME: Implement */ - dccp_pr_debug("Want to help? Start here\n"); - dccp_set_state(sk, state); -} + struct inet_timewait_sock *tw = NULL; -/* This is for handling early-kills of TIME_WAIT sockets. */ -void dccp_tw_deschedule(struct inet_timewait_sock *tw) -{ - dccp_pr_debug("Want to help? Start here\n"); - __inet_twsk_kill(tw, &dccp_hashinfo); + if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) + tw = inet_twsk_alloc(sk, state); + + if (tw != NULL) { + const struct inet_connection_sock *icsk = inet_csk(sk); + const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); + + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); + + /* Get the TIME_WAIT timeout firing. */ + if (timeo < rto) + timeo = rto; + + tw->tw_timeout = DCCP_TIMEWAIT_LEN; + if (state == DCCP_TIME_WAIT) + timeo = DCCP_TIMEWAIT_LEN; + + inet_twsk_schedule(tw, &dccp_death_row, timeo, + DCCP_TIMEWAIT_LEN); + inet_twsk_put(tw); + } else { + /* Sorry, if we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. + */ + if (net_ratelimit()) + printk(KERN_INFO "DCCP: time wait bucket table overflow\n"); + } + + dccp_done(sk); } struct sock *dccp_create_openreq_child(struct sock *sk, @@ -55,7 +95,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk, newdp->dccps_hc_rx_ackpkts = NULL; newdp->dccps_role = DCCP_ROLE_SERVER; - newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newicsk->icsk_rto = DCCP_TIMEOUT_INIT; if (newdp->dccps_options.dccpo_send_ack_vector) { newdp->dccps_hc_rx_ackpkts = dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, -- cgit v0.10.2 From 64ce207306debd7157f47282be94770407bec01c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 20:50:53 -0700 Subject: [NET]: Make NETDEBUG pure printk wrappers Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/net/sock.h b/include/net/sock.h index 8678313..065df67 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1316,11 +1316,11 @@ extern int sock_get_timestamp(struct sock *, struct timeval __user *); */ #if 0 -#define NETDEBUG(x) do { } while (0) -#define LIMIT_NETDEBUG(x) do {} while(0) +#define NETDEBUG(fmt, args...) do { } while (0) +#define LIMIT_NETDEBUG(fmt, args...) do { } while(0) #else -#define NETDEBUG(x) do { x; } while (0) -#define LIMIT_NETDEBUG(x) do { if (net_ratelimit()) { x; } } while(0) +#define NETDEBUG(fmt, args...) printk(fmt,##args) +#define LIMIT_NETDEBUG(fmt, args...) do { if (net_ratelimit()) printk(fmt,##args); } while(0) #endif /* diff --git a/net/dccp/input.c b/net/dccp/input.c index 76c3401..bdaecde 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -161,7 +161,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKPKTS_STATE_RECEIVED)) { - LIMIT_NETDEBUG(pr_info("DCCP: acknowledgeable packets buffer full!\n")); + LIMIT_NETDEBUG(KERN_INFO "DCCP: acknowledgeable packets buffer full!\n"); ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; inet_csk_schedule_ack(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MIN, TCP_RTO_MAX); diff --git a/net/dccp/options.c b/net/dccp/options.c index 9ca32cb..5bf9976 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -231,7 +231,7 @@ void dccp_insert_option(struct sock *sk, struct sk_buff *skb, unsigned char *to; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) { - LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert %d option!\n", option)); + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert %d option!\n", option); return; } @@ -299,7 +299,7 @@ void dccp_insert_option_elapsed_time(struct sock *sk, return; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert elapsed time!\n")); + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert elapsed time!\n"); return; } @@ -335,7 +335,7 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) dccp_insert_option_elapsed_time(sk, skb, elapsed_time); if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert ACK Vector!\n")); + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert ACK Vector!\n"); return; } @@ -412,7 +412,7 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk, struct sk_buff *s unsigned char *to; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - LIMIT_NETDEBUG(pr_info("DCCP: packet too small to insert timestamp echo!\n")); + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert timestamp echo!\n"); return; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index ba57446..b31ffc5 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -331,8 +331,8 @@ static void esp4_err(struct sk_buff *skb, u32 info) x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) return; - NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", - ntohl(esph->spi), ntohl(iph->daddr))); + NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", + ntohl(esph->spi), ntohl(iph->daddr)); xfrm_state_put(x); } @@ -395,10 +395,10 @@ static int esp_init_state(struct xfrm_state *x) if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_tfm_alg_digestsize(esp->auth.tfm)) { - NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", - x->aalg->alg_name, - crypto_tfm_alg_digestsize(esp->auth.tfm), - aalg_desc->uinfo.auth.icv_fullbits/8)); + NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_tfm_alg_digestsize(esp->auth.tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); goto error; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index badfc58..25f66b7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb) break; case ICMP_FRAG_NEEDED: if (ipv4_config.no_pmtu_disc) { - LIMIT_NETDEBUG( - printk(KERN_INFO "ICMP: %u.%u.%u.%u: " + LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: " "fragmentation needed " "and DF set.\n", - NIPQUAD(iph->daddr))); + NIPQUAD(iph->daddr)); } else { info = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu)); @@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb) } break; case ICMP_SR_FAILED: - LIMIT_NETDEBUG( - printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source " + LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source " "Route Failed.\n", - NIPQUAD(iph->daddr))); + NIPQUAD(iph->daddr)); break; default: break; @@ -936,7 +934,7 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); case CHECKSUM_NONE: if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) goto error; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 5088f90..44607f4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb) case IGMP_MTRACE_RESP: break; default: - NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); + NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); } in_dev_put(in_dev); kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index eb377ae..1ac64c0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) return ip_frag_intern(hash, qp); out_nomem: - LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n")); + LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); return NULL; } @@ -625,8 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) return head; out_nomem: - LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing " - "queue %p\n", qp)); + LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " + "queue %p\n", qp); goto out_fail; out_oversize: if (net_ratelimit()) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 633945d..19f24f7 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -573,7 +573,7 @@ slow_path: */ if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { - NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); + NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); err = -ENOMEM; goto fail; } diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 7ded6e6..dcb7ee6 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) spi, IPPROTO_COMP, AF_INET); if (!x) return; - NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", - spi, NIPQUAD(iph->daddr))); + NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", + spi, NIPQUAD(iph->daddr)); xfrm_state_put(x); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 83f7234..32a0ebc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -990,11 +990,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * to destinations, already remembered * to the moment of synflood. */ - LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open " - "request from %u.%u." - "%u.%u/%u\n", - NIPQUAD(saddr), - ntohs(skb->h.th->source))); + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " + "request from %u.%u.%u.%u/%u\n", + NIPQUAD(saddr), + ntohs(skb->h.th->source)); dst_release(dst); goto drop_and_free; } @@ -1118,7 +1117,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb) skb->nh.iph->daddr, skb->csum)) return 0; - LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); + LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); skb->ip_summed = CHECKSUM_NONE; } if (skb->len <= 76) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a8135e1..3a5bbbe 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -629,7 +629,7 @@ back_from_confirm: /* ... which is an evident application bug. --ANK */ release_sock(sk); - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); err = -EINVAL; goto out; } @@ -694,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset, if (unlikely(!up->pending)) { release_sock(sk); - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n"); return -EINVAL; } @@ -1103,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, skb->ip_summed = CHECKSUM_UNNECESSARY; if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) return 0; - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); skb->ip_summed = CHECKSUM_NONE; } if (skb->ip_summed != CHECKSUM_UNNECESSARY) @@ -1182,13 +1182,13 @@ int udp_rcv(struct sk_buff *skb) return(0); short_packet: - LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", - NIPQUAD(saddr), - ntohs(uh->source), - ulen, - len, - NIPQUAD(daddr), - ntohs(uh->dest))); + LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", + NIPQUAD(saddr), + ntohs(uh->source), + ulen, + len, + NIPQUAD(daddr), + ntohs(uh->dest)); no_header: UDP_INC_STATS_BH(UDP_MIB_INERRORS); kfree_skb(skb); @@ -1199,12 +1199,12 @@ csum_error: * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", - NIPQUAD(saddr), - ntohs(uh->source), - NIPQUAD(daddr), - ntohs(uh->dest), - ulen)); + LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", + NIPQUAD(saddr), + ntohs(uh->source), + NIPQUAD(daddr), + ntohs(uh->dest), + ulen); drop: UDP_INC_STATS_BH(UDP_MIB_INERRORS); kfree_skb(skb); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 986fdfd..0ebfad9 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -131,10 +131,10 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len) case NEXTHDR_HOP: case NEXTHDR_DEST: if (!zero_out_mutable_opts(exthdr.opth)) { - LIMIT_NETDEBUG(printk( + LIMIT_NETDEBUG( KERN_WARNING "overrun %sopts\n", nexthdr == NEXTHDR_HOP ? - "hop" : "dest")); + "hop" : "dest"); return -EINVAL; } break; @@ -293,8 +293,7 @@ static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struc skb_push(skb, skb->data - skb->nh.raw); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { - LIMIT_NETDEBUG( - printk(KERN_WARNING "ipsec ah authentication error\n")); + LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); x->stats.integrity_failed++; goto free_out; } @@ -332,9 +331,9 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!x) return; - NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/" - "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", - ntohl(ah->spi), NIP6(iph->daddr))); + NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + ntohl(ah->spi), NIP6(iph->daddr)); xfrm_state_put(x); } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 761984f..01468fa 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -589,8 +589,8 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, break; default: - LIMIT_NETDEBUG( - printk(KERN_DEBUG "invalid cmsg type: %d\n", cmsg->cmsg_type)); + LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n", + cmsg->cmsg_type); err = -EINVAL; break; }; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 324db62..e8bff9d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -212,8 +212,7 @@ static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, stru padlen = nexthdr[0]; if (padlen+2 >= elen) { - LIMIT_NETDEBUG( - printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen)); + LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen); ret = -EINVAL; goto out; } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index e0839ea..5be6da2 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -424,8 +424,8 @@ static int ipv6_hop_ra(struct sk_buff *skb, int optoff) IP6CB(skb)->ra = optoff; return 1; } - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", skb->nh.raw[optoff+1])); + LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", + skb->nh.raw[optoff+1]); kfree_skb(skb); return 0; } @@ -437,8 +437,8 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) u32 pkt_len; if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", skb->nh.raw[optoff+1])); + LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", + skb->nh.raw[optoff+1]); IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); goto drop; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ee9f1d3..ff685f2 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -332,8 +332,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, * for now we don't know that. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); return; } @@ -341,8 +340,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, * Never answer to a ICMP packet. */ if (is_ineligible(skb)) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "icmpv6_send: no reply to icmp error\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"); return; } @@ -393,8 +391,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, len = skb->len - msg.offset; len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); if (len < 0) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "icmp: len problem\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n"); goto out_dst_release; } @@ -584,17 +581,15 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) skb->ip_summed = CHECKSUM_UNNECESSARY; if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, skb->csum)) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ICMPv6 hw checksum failed\n")); + LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); skb->ip_summed = CHECKSUM_NONE; } } if (skb->ip_summed == CHECKSUM_NONE) { if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, skb_checksum(skb, 0, skb->len, 0))) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", - NIP6(*saddr), NIP6(*daddr))); + LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", + NIP6(*saddr), NIP6(*daddr)); goto discard_it; } } @@ -670,8 +665,7 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) break; default: - LIMIT_NETDEBUG( - printk(KERN_DEBUG "icmpv6: msg of unknown type\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n"); /* informational */ if (type & ICMPV6_INFOMSG_MASK) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 00f8514..01ef94f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -625,7 +625,7 @@ slow_path: */ if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { - NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n")); + NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index c8daef9..f8626eb 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -28,8 +28,7 @@ int ip6_route_me_harder(struct sk_buff *skb) if (dst->error) { IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n")); + LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); dst_release(dst); return -EINVAL; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 766e1c7..7a58632 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -343,8 +343,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, skb->len, inet->num, skb->csum)) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "raw v6 hw csum failure.\n")); + LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n"); skb->ip_summed = CHECKSUM_NONE; } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1c21ad6..08c55b0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1564,7 +1564,7 @@ static int tcp_v6_checksum_init(struct sk_buff *skb) if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,skb->csum)) return 0; - LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v6 csum failed\n")); + LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); } if (skb->len <= 76) { if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2ffe34c..c348307 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -478,8 +478,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) /* RFC 2460 section 8.1 says that we SHOULD log this error. Well, it is reasonable. */ - LIMIT_NETDEBUG( - printk(KERN_INFO "IPv6: udp checksum is 0\n")); + LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n"); goto discard; } @@ -494,7 +493,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if (skb->ip_summed==CHECKSUM_HW) { skb->ip_summed = CHECKSUM_UNNECESSARY; if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v6 hw csum failure.\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n"); skb->ip_summed = CHECKSUM_NONE; } } @@ -826,7 +825,7 @@ back_from_confirm: /* ... which is an evident application bug. --ANK */ release_sock(sk); - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); err = -EINVAL; goto out; } -- cgit v0.10.2 From 6687e988d9aeaccad6774e6a8304f681f3ec0a03 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 10 Aug 2005 04:03:31 -0300 Subject: [ICSK]: Move TCP congestion avoidance members to icsk This changeset basically moves tcp_sk()->{ca_ops,ca_state,etc} to inet_csk(), minimal renaming/moving done in this changeset to ease review. Most of it is just changes of struct tcp_sock * to struct sock * parameters. With this we move to a state closer to two interesting goals: 1. Generalisation of net/ipv4/tcp_diag.c, becoming inet_diag.c, being used for any INET transport protocol that has struct inet_hashinfo and are derived from struct inet_connection_sock. Keeps the userspace API, that will just not display DCCP sockets, while newer versions of tools can support DCCP. 2. INET generic transport pluggable Congestion Avoidance infrastructure, using the current TCP CA infrastructure with DCCP. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6200968..ac4ca44 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -258,19 +258,15 @@ struct tcp_sock { __u32 mss_cache; /* Cached effective mss, not including SACKS */ __u16 xmit_size_goal; /* Goal for segmenting output packets */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ - __u8 ca_state; /* State of fast-retransmit machine */ - __u8 keepalive_probes; /* num of allowed keep alive probes */ - __u16 advmss; /* Advertised MSS */ __u32 window_clamp; /* Maximal window to advertise */ __u32 rcv_ssthresh; /* Current window clamp */ __u32 frto_highmark; /* snd_nxt when RTO occurred */ __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 nonagle; /* Disable Nagle algorithm? */ - /* ONE BYTE HOLE, TRY TO PACK */ + __u8 keepalive_probes; /* num of allowed keep alive probes */ /* RTT measurement */ __u32 srtt; /* smoothed round trip time << 3 */ @@ -311,8 +307,7 @@ struct tcp_sock { struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ - __u8 probes_out; /* unanswered 0 window probes */ - __u8 ecn_flags; /* ECN status bits. */ + __u16 advmss; /* Advertised MSS */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ __u32 lost_out; /* Lost packets */ __u32 sacked_out; /* SACK'd packets */ @@ -327,7 +322,7 @@ struct tcp_sock { __u32 urg_seq; /* Seq of received urgent pointer */ __u16 urg_data; /* Saved octet of OOB data and control flags */ __u8 urg_mode; /* In urgent mode */ - /* ONE BYTE HOLE, TRY TO PACK! */ + __u8 ecn_flags; /* ECN status bits. */ __u32 snd_up; /* Urgent pointer */ __u32 total_retrans; /* Total retransmits for entire connection */ @@ -351,11 +346,6 @@ struct tcp_sock { __u32 seq; __u32 time; } rcvq_space; - - /* Pluggable TCP congestion control hook */ - struct tcp_congestion_ops *ca_ops; - u32 ca_priv[16]; -#define TCP_CA_PRIV_SIZE (16*sizeof(u32)) }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -377,11 +367,6 @@ static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) return (struct tcp_timewait_sock *)sk; } -static inline void *tcp_ca(const struct tcp_sock *tp) -{ - return (void *) tp->ca_priv; -} - #endif #endif /* _LINUX_TCP_H */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index bec19d5..4d7e708 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -27,6 +27,7 @@ struct inet_bind_bucket; struct inet_hashinfo; +struct tcp_congestion_ops; /** inet_connection_sock - INET connection oriented sock * @@ -35,10 +36,13 @@ struct inet_hashinfo; * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout + * @icsk_ca_ops Pluggable congestion control hook + * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event * @icsk_backoff: Backoff * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries + * @icsk_probes_out: unanswered 0 window probes * @icsk_ack: Delayed ACK control data */ struct inet_connection_sock { @@ -50,10 +54,14 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + struct tcp_congestion_ops *icsk_ca_ops; + __u8 icsk_ca_state; __u8 icsk_retransmits; __u8 icsk_pending; __u8 icsk_backoff; __u8 icsk_syn_retries; + __u8 icsk_probes_out; + /* 2 BYTES HOLE, TRY TO PACK! */ struct { __u8 pending; /* ACK is pending */ __u8 quick; /* Scheduled number of quick acks */ @@ -65,6 +73,8 @@ struct inet_connection_sock { __u16 last_seg_size; /* Size of last incoming segment */ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ } icsk_ack; + u32 icsk_ca_priv[16]; +#define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ @@ -77,6 +87,11 @@ static inline struct inet_connection_sock *inet_csk(const struct sock *sk) return (struct inet_connection_sock *)sk; } +static inline void *inet_csk_ca(const struct sock *sk) +{ + return (void *)inet_csk(sk)->icsk_ca_priv; +} + extern struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, const unsigned int __nocast priority); diff --git a/include/net/tcp.h b/include/net/tcp.h index d489ac5..0b3f729 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -669,29 +669,29 @@ struct tcp_congestion_ops { struct list_head list; /* initialize private data (optional) */ - void (*init)(struct tcp_sock *tp); + void (*init)(struct sock *sk); /* cleanup private data (optional) */ - void (*release)(struct tcp_sock *tp); + void (*release)(struct sock *sk); /* return slow start threshold (required) */ - u32 (*ssthresh)(struct tcp_sock *tp); + u32 (*ssthresh)(struct sock *sk); /* lower bound for congestion window (optional) */ - u32 (*min_cwnd)(struct tcp_sock *tp); + u32 (*min_cwnd)(struct sock *sk); /* do new cwnd calculation (required) */ - void (*cong_avoid)(struct tcp_sock *tp, u32 ack, + void (*cong_avoid)(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int good_ack); /* round trip time sample per acked packet (optional) */ - void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); + void (*rtt_sample)(struct sock *sk, u32 usrtt); /* call before changing ca_state (optional) */ - void (*set_state)(struct tcp_sock *tp, u8 new_state); + void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ - void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); + void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); /* new value of cwnd after loss (optional) */ - u32 (*undo_cwnd)(struct tcp_sock *tp); + u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked); + void (*pkts_acked)(struct sock *sk, u32 num_acked); /* get info for tcp_diag (optional) */ - void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb); + void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb); char name[TCP_CA_NAME_MAX]; struct module *owner; @@ -700,30 +700,34 @@ struct tcp_congestion_ops { extern int tcp_register_congestion_control(struct tcp_congestion_ops *type); extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); -extern void tcp_init_congestion_control(struct tcp_sock *tp); -extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); +extern void tcp_init_congestion_control(struct sock *sk); +extern void tcp_cleanup_congestion_control(struct sock *sk); extern int tcp_set_default_congestion_control(const char *name); extern void tcp_get_default_congestion_control(char *name); -extern int tcp_set_congestion_control(struct tcp_sock *tp, const char *name); +extern int tcp_set_congestion_control(struct sock *sk, const char *name); extern struct tcp_congestion_ops tcp_init_congestion_ops; -extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); -extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, +extern u32 tcp_reno_ssthresh(struct sock *sk); +extern void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag); -extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp); +extern u32 tcp_reno_min_cwnd(struct sock *sk); extern struct tcp_congestion_ops tcp_reno; -static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) +static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { - if (tp->ca_ops->set_state) - tp->ca_ops->set_state(tp, ca_state); - tp->ca_state = ca_state; + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->set_state) + icsk->icsk_ca_ops->set_state(sk, ca_state); + icsk->icsk_ca_state = ca_state; } -static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event) +static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { - if (tp->ca_ops->cwnd_event) - tp->ca_ops->cwnd_event(tp, event); + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->cwnd_event) + icsk->icsk_ca_ops->cwnd_event(sk, event); } /* This determines how many packets are "in the network" to the best @@ -749,9 +753,10 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) * The exception is rate halving phase, when cwnd is decreasing towards * ssthresh. */ -static inline __u32 tcp_current_ssthresh(struct tcp_sock *tp) +static inline __u32 tcp_current_ssthresh(const struct sock *sk) { - if ((1<ca_state)&(TCPF_CA_CWR|TCPF_CA_Recovery)) + const struct tcp_sock *tp = tcp_sk(sk); + if ((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_CWR | TCPF_CA_Recovery)) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, @@ -768,10 +773,13 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) } /* Set slow start threshold and cwnd not falling to slow start */ -static inline void __tcp_enter_cwr(struct tcp_sock *tp) +static inline void __tcp_enter_cwr(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + tp->undo_marker = 0; - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -780,12 +788,14 @@ static inline void __tcp_enter_cwr(struct tcp_sock *tp) TCP_ECN_queue_cwr(tp); } -static inline void tcp_enter_cwr(struct tcp_sock *tp) +static inline void tcp_enter_cwr(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + tp->prior_ssthresh = 0; - if (tp->ca_state < TCP_CA_CWR) { - __tcp_enter_cwr(tp); - tcp_set_ca_state(tp, TCP_CA_CWR); + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + __tcp_enter_cwr(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); } } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 026630a..fe3c6d3 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -508,7 +508,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, newsk->sk_write_space = sk_stream_write_space; newicsk->icsk_retransmits = 0; - newicsk->icsk_backoff = 0; + newicsk->icsk_backoff = 0; + newicsk->icsk_probes_out = 0; /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0eed64a..02848e7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1671,11 +1671,11 @@ int tcp_disconnect(struct sock *sk, int flags) tp->write_seq = 1; icsk->icsk_backoff = 0; tp->snd_cwnd = 2; - tp->probes_out = 0; + icsk->icsk_probes_out = 0; tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); sk->sk_send_head = NULL; @@ -1718,7 +1718,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(tp, name); + err = tcp_set_congestion_control(sk, name); release_sock(sk); return err; } @@ -1886,9 +1886,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) memset(info, 0, sizeof(*info)); info->tcpi_state = sk->sk_state; - info->tcpi_ca_state = tp->ca_state; + info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; - info->tcpi_probes = tp->probes_out; + info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) @@ -2016,7 +2016,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, len = min_t(unsigned int, len, TCP_CA_NAME_MAX); if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, tp->ca_ops->name, len)) + if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; default: diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ec38d45..b940346 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca) ca->delayed_ack = 2 << ACK_RATIO_SHIFT; } -static void bictcp_init(struct tcp_sock *tp) +static void bictcp_init(struct sock *sk) { - bictcp_reset(tcp_ca(tp)); + bictcp_reset(inet_csk_ca(sk)); if (initial_ssthresh) - tp->snd_ssthresh = initial_ssthresh; + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } /* @@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* Detect low utilization in congestion avoidance */ -static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) +static inline void bictcp_low_utilization(struct sock *sk, int flag) { - struct bictcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); u32 dist, delay; /* No time stamp */ @@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) } -static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int data_acked) { - struct bictcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); - bictcp_low_utilization(tp, data_acked); + bictcp_low_utilization(sk, data_acked); if (in_flight < tp->snd_cwnd) return; @@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, * behave like Reno until low_window is reached, * then increase congestion window slowly */ -static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) +static u32 bictcp_recalc_ssthresh(struct sock *sk) { - struct bictcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ @@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -static u32 bictcp_undo_cwnd(struct tcp_sock *tp) +static u32 bictcp_undo_cwnd(struct sock *sk) { - struct bictcp *ca = tcp_ca(tp); - + const struct tcp_sock *tp = tcp_sk(sk); + const struct bictcp *ca = inet_csk_ca(sk); return max(tp->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct tcp_sock *tp) +static u32 bictcp_min_cwnd(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh; } -static void bictcp_state(struct tcp_sock *tp, u8 new_state) +static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) - bictcp_reset(tcp_ca(tp)); + bictcp_reset(inet_csk_ca(sk)); } /* Track delayed acknowledgement ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct tcp_sock *tp, u32 cnt) +static void bictcp_acked(struct sock *sk, u32 cnt) { - if (cnt > 0 && tp->ca_state == TCP_CA_Open) { - struct bictcp *ca = tcp_ca(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + struct bictcp *ca = inet_csk_ca(sk); cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; } @@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = { static int __init bictcp_register(void) { - BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&bictcp); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 4970d10..bbf2d66 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Assign choice of congestion control. */ -void tcp_init_congestion_control(struct tcp_sock *tp) +void tcp_init_congestion_control(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; - if (tp->ca_ops != &tcp_init_congestion_ops) + if (icsk->icsk_ca_ops != &tcp_init_congestion_ops) return; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (try_module_get(ca->owner)) { - tp->ca_ops = ca; + icsk->icsk_ca_ops = ca; break; } } rcu_read_unlock(); - if (tp->ca_ops->init) - tp->ca_ops->init(tp); + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); } /* Manage refcounts on socket close. */ -void tcp_cleanup_congestion_control(struct tcp_sock *tp) +void tcp_cleanup_congestion_control(struct sock *sk) { - if (tp->ca_ops->release) - tp->ca_ops->release(tp); - module_put(tp->ca_ops->owner); + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + module_put(icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ @@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name) } /* Change congestion control for socket */ -int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) +int tcp_set_congestion_control(struct sock *sk, const char *name) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; int err = 0; rcu_read_lock(); ca = tcp_ca_find(name); - if (ca == tp->ca_ops) + if (ca == icsk->icsk_ca_ops) goto out; if (!ca) @@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) err = -EBUSY; else { - tcp_cleanup_congestion_control(tp); - tp->ca_ops = ca; - if (tp->ca_ops->init) - tp->ca_ops->init(tp); + tcp_cleanup_congestion_control(sk); + icsk->icsk_ca_ops = ca; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); } out: rcu_read_unlock(); @@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { + struct tcp_sock *tp = tcp_sk(sk); + if (in_flight < tp->snd_cwnd) return; @@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ -u32 tcp_reno_ssthresh(struct tcp_sock *tp) +u32 tcp_reno_ssthresh(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return max(tp->snd_cwnd >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); /* Lower bound on congestion window. */ -u32 tcp_reno_min_cwnd(struct tcp_sock *tp) +u32 tcp_reno_min_cwnd(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh/2; } EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 5f4c74f..4288ecf 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -66,10 +66,10 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); - if (ext & (1<<(TCPDIAG_CONG-1))) { - size_t len = strlen(tp->ca_ops->name); + if ((ext & (1 << (TCPDIAG_CONG - 1))) && icsk->icsk_ca_ops) { + size_t len = strlen(icsk->icsk_ca_ops->name); strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), - tp->ca_ops->name); + icsk->icsk_ca_ops->name); } } r->tcpdiag_family = sk->sk_family; @@ -136,18 +136,17 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->tcpdiag_timer = 4; - r->tcpdiag_retrans = tp->probes_out; + r->tcpdiag_retrans = icsk->icsk_probes_out; r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (timer_pending(&sk->sk_timer)) { r->tcpdiag_timer = 2; - r->tcpdiag_retrans = tp->probes_out; + r->tcpdiag_retrans = icsk->icsk_probes_out; r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); } else { r->tcpdiag_timer = 0; r->tcpdiag_expires = 0; } #undef EXPIRES_IN_MS - r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; r->tcpdiag_uid = sock_i_uid(sk); @@ -163,8 +162,9 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (info) tcp_get_info(sk, info); - if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) - tp->ca_ops->get_info(tp, ext, skb); + if (sk->sk_state < TCP_TIME_WAIT && + icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) + icsk->icsk_ca_ops->get_info(sk, ext, skb); nlh->nlmsg_len = skb->tail - b; return skb->len; diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 36c51f8..6acc04b 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -98,9 +98,10 @@ struct hstcp { u32 ai; }; -static void hstcp_init(struct tcp_sock *tp) +static void hstcp_init(struct sock *sk) { - struct hstcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hstcp *ca = inet_csk_ca(sk); ca->ai = 0; @@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, +static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, u32 in_flight, int good) { - struct hstcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hstcp *ca = inet_csk_ca(sk); if (in_flight < tp->snd_cwnd) return; @@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, } } -static u32 hstcp_ssthresh(struct tcp_sock *tp) +static u32 hstcp_ssthresh(struct sock *sk) { - struct hstcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + const struct hstcp *ca = inet_csk_ca(sk); /* Do multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); @@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = { static int __init hstcp_register(void) { - BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_highspeed); } diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 4016827..e47b379 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca) ca->snd_cwnd_cnt2 = 0; } -static u32 htcp_cwnd_undo(struct tcp_sock *tp) +static u32 htcp_cwnd_undo(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); ca->ccount = ca->undo_ccount; ca->maxRTT = ca->undo_maxRTT; ca->old_maxB = ca->undo_old_maxB; return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); } -static inline void measure_rtt(struct tcp_sock *tp) +static inline void measure_rtt(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); u32 srtt = tp->srtt>>3; /* keep track of minimum RTT seen so far, minRTT is zero at first */ @@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp) ca->minRTT = srtt; /* max RTT */ - if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { + if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { if (ca->maxRTT < ca->minRTT) ca->maxRTT = ca->minRTT; if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) @@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp) } } -static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) +static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked) { - struct htcp *ca = tcp_ca(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); u32 now = tcp_time_stamp; /* achieved throughput calculations */ - if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { + if (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_Disorder) { ca->packetcount = 0; ca->lasttime = now; return; @@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca) * that point do we really have a real sense of maxRTT (the queues en route * were getting just too full now). */ -static void htcp_param_update(struct tcp_sock *tp) +static void htcp_param_update(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + struct htcp *ca = inet_csk_ca(sk); u32 minRTT = ca->minRTT; u32 maxRTT = ca->maxRTT; @@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp) ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; } -static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) +static u32 htcp_recalc_ssthresh(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); - htcp_param_update(tp); + const struct tcp_sock *tp = tcp_sk(sk); + const struct htcp *ca = inet_csk_ca(sk); + htcp_param_update(sk); return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int data_acked) { - struct htcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); if (in_flight < tp->snd_cwnd) return; @@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; } else { - measure_rtt(tp); + measure_rtt(sk); /* keep track of number of round-trip times since last backoff event */ if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { @@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, } /* Lower bound on congestion window. */ -static u32 htcp_min_cwnd(struct tcp_sock *tp) +static u32 htcp_min_cwnd(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh; } -static void htcp_init(struct tcp_sock *tp) +static void htcp_init(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + struct htcp *ca = inet_csk_ca(sk); memset(ca, 0, sizeof(struct htcp)); ca->alpha = ALPHA_BASE; ca->beta = BETA_MIN; } -static void htcp_state(struct tcp_sock *tp, u8 new_state) +static void htcp_state(struct sock *sk, u8 new_state) { switch (new_state) { case TCP_CA_CWR: case TCP_CA_Recovery: case TCP_CA_Loss: - htcp_reset(tcp_ca(tp)); + htcp_reset(inet_csk_ca(sk)); break; } } @@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = { static int __init htcp_register(void) { - BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE); BUILD_BUG_ON(BETA_MIN >= BETA_MAX); if (!use_bandwidth_switch) htcp.pkts_acked = NULL; diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 13a6634..77add63 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); /* This is called to refresh values for hybla parameters */ -static inline void hybla_recalc_param (struct tcp_sock *tp) +static inline void hybla_recalc_param (struct sock *sk) { - struct hybla *ca = tcp_ca(tp); + struct hybla *ca = inet_csk_ca(sk); - ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); ca->rho = ca->rho_3ls >> 3; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; ca->rho2 = ca->rho2_7ls >>7; } -static void hybla_init(struct tcp_sock *tp) +static void hybla_init(struct sock *sk) { - struct hybla *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hybla *ca = inet_csk_ca(sk); ca->rho = 0; ca->rho2 = 0; @@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp) tp->snd_cwnd_clamp = 65535; /* 1st Rho measurement based on initial srtt */ - hybla_recalc_param(tp); + hybla_recalc_param(sk); /* set minimum rtt as this is the 1st ever seen */ ca->minrtt = tp->srtt; tp->snd_cwnd = ca->rho; } -static void hybla_state(struct tcp_sock *tp, u8 ca_state) +static void hybla_state(struct sock *sk, u8 ca_state) { - struct hybla *ca = tcp_ca(tp); - + struct hybla *ca = inet_csk_ca(sk); ca->hybla_en = (ca_state == TCP_CA_Open); } @@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { - struct hybla *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hybla *ca = inet_csk_ca(sk); u32 increment, odd, rho_fractions; int is_slowstart = 0; /* Recalculate rho only if this srtt is the lowest */ if (tp->srtt < ca->minrtt){ - hybla_recalc_param(tp); + hybla_recalc_param(sk); ca->minrtt = tp->srtt; } if (!ca->hybla_en) - return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); if (in_flight < tp->snd_cwnd) return; if (ca->rho == 0) - hybla_recalc_param(tp); + hybla_recalc_param(sk); rho_fractions = ca->rho_3ls - (ca->rho << 3); @@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = { static int __init hybla_register(void) { - BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_hybla); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 71d4561..fdd9547 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -325,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk) /* 5. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { + struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; unsigned int app_win = tp->rcv_nxt - tp->copied_seq; int ofo_win = 0; - inet_csk(sk)->icsk_ack.quick = 0; + icsk->icsk_ack.quick = 0; skb_queue_walk(&tp->out_of_order_queue, skb) { ofo_win += skb->len; @@ -350,8 +351,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) app_win += ofo_win; if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) app_win >>= 1; - if (app_win > inet_csk(sk)->icsk_ack.rcv_mss) - app_win -= inet_csk(sk)->icsk_ack.rcv_mss; + if (app_win > icsk->icsk_ack.rcv_mss) + app_win -= icsk->icsk_ack.rcv_mss; app_win = max(app_win, 2U*tp->advmss); if (!ofo_win) @@ -549,8 +550,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) { + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's @@ -610,8 +613,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) tp->rtt_seq = tp->snd_nxt; } - if (tp->ca_ops->rtt_sample) - tp->ca_ops->rtt_sample(tp, *usrtt); + if (icsk->icsk_ca_ops->rtt_sample) + icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -663,9 +666,10 @@ void tcp_update_metrics(struct sock *sk) dst_confirm(dst); if (dst && (dst->flags&DST_HOST)) { + const struct inet_connection_sock *icsk = inet_csk(sk); int m; - if (inet_csk(sk)->icsk_backoff || !tp->srtt) { + if (icsk->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. * Reset our results. @@ -714,7 +718,7 @@ void tcp_update_metrics(struct sock *sk) tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; } else if (tp->snd_cwnd > tp->snd_ssthresh && - tp->ca_state == TCP_CA_Open) { + icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!dst_metric_locked(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = @@ -828,8 +832,10 @@ reset: } } -static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) +static void tcp_update_reordering(struct sock *sk, const int metric, + const int ts) { + struct tcp_sock *tp = tcp_sk(sk); if (metric > tp->reordering) { tp->reordering = min(TCP_MAX_REORDERING, metric); @@ -844,7 +850,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); #if FASTRETRANS_DEBUG > 1 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", - tp->rx_opt.sack_ok, tp->ca_state, + tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, tp->fackets_out, tp->sacked_out, @@ -906,6 +912,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) static int tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); @@ -1071,7 +1078,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ * we have to account for reordering! Ugly, * but should help. */ - if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { + if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { struct sk_buff *skb; sk_stream_for_retrans_queue(skb, sk) { @@ -1100,8 +1107,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->left_out = tp->sacked_out + tp->lost_out; - if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss) - tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0); + if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss) + tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); #if FASTRETRANS_DEBUG > 0 BUG_TRAP((int)tp->sacked_out >= 0); @@ -1118,17 +1125,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ */ void tcp_enter_frto(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; tp->frto_counter = 1; - if (tp->ca_state <= TCP_CA_Disorder || + if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !inet_csk(sk)->icsk_retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); - tcp_ca_event(tp, CA_EVENT_FRTO); + (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tcp_ca_event(sk, CA_EVENT_FRTO); } /* Have to clear retransmission markers here to keep the bookkeeping @@ -1145,7 +1153,7 @@ void tcp_enter_frto(struct sock *sk) } tcp_sync_left_out(tp); - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); tp->frto_highmark = tp->snd_nxt; } @@ -1191,7 +1199,7 @@ static void tcp_enter_frto_loss(struct sock *sk) tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); - tcp_set_ca_state(tp, TCP_CA_Loss); + tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); } @@ -1215,16 +1223,17 @@ void tcp_clear_retrans(struct tcp_sock *tp) */ void tcp_enter_loss(struct sock *sk, int how) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt = 0; /* Reduce ssthresh if it has not yet been made inside this window. */ - if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !inet_csk(sk)->icsk_retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); - tcp_ca_event(tp, CA_EVENT_LOSS); + if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || + (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tcp_ca_event(sk, CA_EVENT_LOSS); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1255,7 +1264,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); - tcp_set_ca_state(tp, TCP_CA_Loss); + tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); } @@ -1272,13 +1281,14 @@ static int tcp_check_sack_reneging(struct sock *sk) */ if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); - inet_csk(sk)->icsk_retransmits++; + icsk->icsk_retransmits++; tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + icsk->icsk_rto, TCP_RTO_MAX); return 1; } return 0; @@ -1431,8 +1441,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) * in assumption of absent reordering, interpret this as reordering. * The only another reason could be bug in receiver TCP. */ -static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) +static void tcp_check_reno_reordering(struct sock *sk, const int addend) { + struct tcp_sock *tp = tcp_sk(sk); u32 holes; holes = max(tp->lost_out, 1U); @@ -1440,16 +1451,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) if ((tp->sacked_out + holes) > tp->packets_out) { tp->sacked_out = tp->packets_out - holes; - tcp_update_reordering(tp, tp->packets_out+addend, 0); + tcp_update_reordering(sk, tp->packets_out + addend, 0); } } /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct tcp_sock *tp) +static void tcp_add_reno_sack(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); tp->sacked_out++; - tcp_check_reno_reordering(tp, 0); + tcp_check_reno_reordering(sk, 0); tcp_sync_left_out(tp); } @@ -1464,7 +1476,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke else tp->sacked_out -= acked-1; } - tcp_check_reno_reordering(tp, acked); + tcp_check_reno_reordering(sk, acked); tcp_sync_left_out(tp); } @@ -1538,14 +1550,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) } /* Decrease cwnd each second ack. */ -static void tcp_cwnd_down(struct tcp_sock *tp) +static void tcp_cwnd_down(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) + if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1579,11 +1593,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) #define DBGUNDO(x...) do { } while (0) #endif -static void tcp_undo_cwr(struct tcp_sock *tp, int undo) +static void tcp_undo_cwr(struct sock *sk, const int undo) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->prior_ssthresh) { - if (tp->ca_ops->undo_cwnd) - tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->undo_cwnd) + tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); @@ -1611,9 +1629,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ - DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(tp, 1); - if (tp->ca_state == TCP_CA_Loss) + DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); + tcp_undo_cwr(sk, 1); + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); else NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); @@ -1626,7 +1644,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) tcp_moderate_cwnd(tp); return 1; } - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); return 0; } @@ -1635,7 +1653,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) { if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, tp, "D-SACK"); - tcp_undo_cwr(tp, 1); + tcp_undo_cwr(sk, 1); tp->undo_marker = 0; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); } @@ -1656,10 +1674,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, if (tp->retrans_out == 0) tp->retrans_stamp = 0; - tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); + tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); DBGUNDO(sk, tp, "Hoe"); - tcp_undo_cwr(tp, 0); + tcp_undo_cwr(sk, 0); NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. @@ -1682,22 +1700,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; - tcp_undo_cwr(tp, 1); + tcp_undo_cwr(sk, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; if (!IsReno(tp)) - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); return 1; } return 0; } -static inline void tcp_complete_cwr(struct tcp_sock *tp) +static inline void tcp_complete_cwr(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); + tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) @@ -1708,21 +1727,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) tp->retrans_stamp = 0; if (flag&FLAG_ECE) - tcp_enter_cwr(tp); + tcp_enter_cwr(sk); - if (tp->ca_state != TCP_CA_CWR) { + if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; if (tp->left_out || tp->retrans_out || tp->undo_marker) state = TCP_CA_Disorder; - if (tp->ca_state != state) { - tcp_set_ca_state(tp, state); + if (inet_csk(sk)->icsk_ca_state != state) { + tcp_set_ca_state(sk, state); tp->high_seq = tp->snd_nxt; } tcp_moderate_cwnd(tp); } else { - tcp_cwnd_down(tp); + tcp_cwnd_down(sk); } } @@ -1741,6 +1760,7 @@ static void tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, int prior_packets, int flag) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); @@ -1764,7 +1784,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* C. Process data loss notification, provided it is valid. */ if ((flag&FLAG_DATA_LOST) && before(tp->snd_una, tp->high_seq) && - tp->ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); @@ -1775,14 +1795,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ - if (tp->ca_state == TCP_CA_Open) { + if (icsk->icsk_ca_state == TCP_CA_Open) { if (!sysctl_tcp_frto) BUG_TRAP(tp->retrans_out == 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { - switch (tp->ca_state) { + switch (icsk->icsk_ca_state) { case TCP_CA_Loss: - inet_csk(sk)->icsk_retransmits = 0; + icsk->icsk_retransmits = 0; if (tcp_try_undo_recovery(sk, tp)) return; break; @@ -1791,8 +1811,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* CWR is to be held something *above* high_seq * is ACKed for CWR bit to reach receiver. */ if (tp->snd_una != tp->high_seq) { - tcp_complete_cwr(tp); - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_complete_cwr(sk); + tcp_set_ca_state(sk, TCP_CA_Open); } break; @@ -1803,7 +1823,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, * catching for all duplicate ACKs. */ IsReno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); } break; @@ -1812,17 +1832,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk, tp)) return; - tcp_complete_cwr(tp); + tcp_complete_cwr(sk); break; } } /* F. Process state. */ - switch (tp->ca_state) { + switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (prior_snd_una == tp->snd_una) { if (IsReno(tp) && is_dupack) - tcp_add_reno_sack(tp); + tcp_add_reno_sack(sk); } else { int acked = prior_packets - tp->packets_out; if (IsReno(tp)) @@ -1832,13 +1852,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) - inet_csk(sk)->icsk_retransmits = 0; + icsk->icsk_retransmits = 0; if (!tcp_try_undo_loss(sk, tp)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); return; } - if (tp->ca_state != TCP_CA_Open) + if (icsk->icsk_ca_state != TCP_CA_Open) return; /* Loss is undone; fall through to processing in Open state. */ default: @@ -1846,10 +1866,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (tp->snd_una != prior_snd_una) tcp_reset_reno_sack(tp); if (is_dupack) - tcp_add_reno_sack(tp); + tcp_add_reno_sack(sk); } - if (tp->ca_state == TCP_CA_Disorder) + if (icsk->icsk_ca_state == TCP_CA_Disorder) tcp_try_undo_dsack(sk, tp); if (!tcp_time_to_recover(sk, tp)) { @@ -1869,20 +1889,20 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tp->undo_marker = tp->snd_una; tp->undo_retrans = tp->retrans_out; - if (tp->ca_state < TCP_CA_CWR) { + if (icsk->icsk_ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) - tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); TCP_ECN_queue_cwr(tp); } tp->snd_cwnd_cnt = 0; - tcp_set_ca_state(tp, TCP_CA_Recovery); + tcp_set_ca_state(sk, TCP_CA_Recovery); } if (is_dupack || tcp_head_timedout(sk, tp)) tcp_update_scoreboard(sk, tp); - tcp_cwnd_down(tp); + tcp_cwnd_down(sk); tcp_xmit_retransmit_queue(sk); } @@ -1908,7 +1928,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) */ struct tcp_sock *tp = tcp_sk(sk); const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt, usrtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); @@ -1928,7 +1948,7 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tcp_sk(sk), seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt, usrtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); @@ -1945,11 +1965,12 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); } -static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int good) { - tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); - tp->snd_cwnd_stamp = tcp_time_stamp; + const struct inet_connection_sock *icsk = inet_csk(sk); + icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); + tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } /* Restart timer after forward progress on connection. @@ -2098,11 +2119,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt } if (acked&FLAG_ACKED) { + const struct inet_connection_sock *icsk = inet_csk(sk); tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); - if (tp->ca_ops->pkts_acked) - tp->ca_ops->pkts_acked(tp, pkts_acked); + if (icsk->icsk_ca_ops->pkts_acked) + icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); } #if FASTRETRANS_DEBUG > 0 @@ -2110,19 +2132,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt BUG_TRAP((int)tp->lost_out >= 0); BUG_TRAP((int)tp->retrans_out >= 0); if (!tp->packets_out && tp->rx_opt.sack_ok) { + const struct inet_connection_sock *icsk = inet_csk(sk); if (tp->lost_out) { printk(KERN_DEBUG "Leak l=%u %d\n", - tp->lost_out, tp->ca_state); + tp->lost_out, icsk->icsk_ca_state); tp->lost_out = 0; } if (tp->sacked_out) { printk(KERN_DEBUG "Leak s=%u %d\n", - tp->sacked_out, tp->ca_state); + tp->sacked_out, icsk->icsk_ca_state); tp->sacked_out = 0; } if (tp->retrans_out) { printk(KERN_DEBUG "Leak r=%u %d\n", - tp->retrans_out, tp->ca_state); + tp->retrans_out, icsk->icsk_ca_state); tp->retrans_out = 0; } } @@ -2152,16 +2175,17 @@ static void tcp_ack_probe(struct sock *sk) } } -static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag) +static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) { return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || - tp->ca_state != TCP_CA_Open); + inet_csk(sk)->icsk_ca_state != TCP_CA_Open); } -static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) +static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) { + const struct tcp_sock *tp = tcp_sk(sk); return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && - !((1<ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); + !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); } /* Check that window update is acceptable. @@ -2251,6 +2275,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; @@ -2278,7 +2303,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_ca_event(tp, CA_EVENT_FAST_ACK); + tcp_ca_event(sk, CA_EVENT_FAST_ACK); NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); } else { @@ -2295,7 +2320,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) flag |= FLAG_ECE; - tcp_ca_event(tp, CA_EVENT_SLOW_ACK); + tcp_ca_event(sk, CA_EVENT_SLOW_ACK); } /* We passed data and got it acked, remove any soft error @@ -2311,19 +2336,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, &seq_rtt, - tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); + icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); - if (tcp_ack_is_dubious(tp, flag)) { + if (tcp_ack_is_dubious(sk, flag)) { /* Advanve CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) - tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); + if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { if ((flag & FLAG_DATA_ACKED)) - tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); + tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -2332,7 +2357,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) return 1; no_queue: - tp->probes_out = 0; + icsk->icsk_probes_out = 0; /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3301,12 +3326,12 @@ void tcp_cwnd_application_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->ca_state == TCP_CA_Open && + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { /* Limited by application or receiver window. */ u32 win_used = max(tp->snd_cwnd_used, 2U); if (win_used < tp->snd_cwnd) { - tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; } tp->snd_cwnd_used = 0; @@ -3935,7 +3960,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); - tcp_init_congestion_control(tp); + tcp_init_congestion_control(sk); /* Prevent spurious tcp_cwnd_restart() on first data * packet. @@ -4212,7 +4237,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); - tcp_init_congestion_control(tp); + tcp_init_congestion_control(sk); /* Prevent spurious tcp_cwnd_restart() on * first data packet. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 32a0ebc..97bbf59 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1409,13 +1409,14 @@ struct tcp_func ipv4_specific = { */ static int tcp_v4_init_sock(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the @@ -1433,7 +1434,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; - tp->ca_ops = &tcp_init_congestion_ops; + icsk->icsk_ca_ops = &tcp_init_congestion_ops; sk->sk_state = TCP_CLOSE; @@ -1456,7 +1457,7 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_clear_xmit_timers(sk); - tcp_cleanup_congestion_control(tp); + tcp_cleanup_congestion_control(sk); /* Cleanup up the write buffer. */ sk_stream_writequeue_purge(sk); @@ -1883,7 +1884,7 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, sock_i_uid(sp), - tp->probes_out, + icsk->icsk_probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, icsk->icsk_rto, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dc08523..a88db28 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -384,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_counter = 0; newtp->frto_highmark = 0; - newtp->ca_ops = &tcp_reno; + newicsk->icsk_ca_ops = &tcp_reno; - tcp_set_ca_state(newtp, TCP_CA_Open); + tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); newtp->rcv_wup = treq->rcv_isn + 1; @@ -399,7 +399,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.dsack = 0; newtp->rx_opt.eff_sacks = 0; - newtp->probes_out = 0; newtp->rx_opt.num_sacks = 0; newtp->urg_data = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f458eac..267b0fc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -112,9 +112,9 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; - tcp_ca_event(tp, CA_EVENT_CWND_RESTART); + tcp_ca_event(sk, CA_EVENT_CWND_RESTART); - tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_current_ssthresh(sk); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) @@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk) static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if (skb != NULL) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -280,7 +281,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_SACK 0x4 /* If congestion control is doing timestamping */ - if (tp->ca_ops->rtt_sample) + if (icsk->icsk_ca_ops->rtt_sample) do_gettimeofday(&skb->stamp); sysctl_flags = 0; @@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) } if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(tp, CA_EVENT_TX_START); + tcp_ca_event(sk, CA_EVENT_TX_START); th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; @@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) if (err <= 0) return err; - tcp_enter_cwr(tp); + tcp_enter_cwr(sk); /* NET_XMIT_CN is special. It does not guarantee, * that this packet is lost. It tells that device @@ -905,12 +906,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, */ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { + const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) return 0; - if (tp->ca_state != TCP_CA_Open) + if (icsk->icsk_ca_state != TCP_CA_Open) return 0; in_flight = tcp_packets_in_flight(tp); @@ -1287,6 +1289,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m */ void tcp_simple_retransmit(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk, 0); @@ -1317,12 +1320,12 @@ void tcp_simple_retransmit(struct sock *sk) * in network, but units changed and effective * cwnd/ssthresh really reduced now. */ - if (tp->ca_state != TCP_CA_Loss) { + if (icsk->icsk_ca_state != TCP_CA_Loss) { tp->high_seq = tp->snd_nxt; - tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->prior_ssthresh = 0; tp->undo_marker = 0; - tcp_set_ca_state(tp, TCP_CA_Loss); + tcp_set_ca_state(sk, TCP_CA_Loss); } tcp_xmit_retransmit_queue(sk); } @@ -1462,6 +1465,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ void tcp_xmit_retransmit_queue(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int packet_cnt = tp->lost_out; @@ -1485,7 +1489,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { if (tcp_retransmit_skb(sk, skb)) return; - if (tp->ca_state != TCP_CA_Loss) + if (icsk->icsk_ca_state != TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); @@ -1507,7 +1511,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) /* OK, demanded retransmission is finished. */ /* Forward retransmissions are possible only during Recovery. */ - if (tp->ca_state != TCP_CA_Recovery) + if (icsk->icsk_ca_state != TCP_CA_Recovery) return; /* No forward retransmissions in Reno are possible. */ @@ -2028,7 +2032,7 @@ void tcp_send_probe0(struct sock *sk) if (tp->packets_out || !sk->sk_send_head) { /* Cancel probe timer, if it is not required. */ - tp->probes_out = 0; + icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; return; } @@ -2036,19 +2040,19 @@ void tcp_send_probe0(struct sock *sk) if (err <= 0) { if (icsk->icsk_backoff < sysctl_tcp_retries2) icsk->icsk_backoff++; - tp->probes_out++; + icsk->icsk_probes_out++; inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, - * do not backoff and do not remember probes_out. + * do not backoff and do not remember icsk_probes_out. * Let local senders to fight for local resources. * * Use accumulated backoff yet. */ - if (!tp->probes_out) - tp->probes_out=1; + if (!icsk->icsk_probes_out) + icsk->icsk_probes_out = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RESOURCE_PROBE_INTERVAL), diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 70e108e..327770b 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -16,9 +16,10 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { + struct tcp_sock *tp = tcp_sk(sk); if (in_flight < tp->snd_cwnd) return; @@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, tp->snd_cwnd_stamp = tcp_time_stamp; } -static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) +static u32 tcp_scalable_ssthresh(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 72cec69..415ee47 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -233,11 +233,12 @@ out_unlock: static void tcp_probe_timer(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; if (tp->packets_out || !sk->sk_send_head) { - tp->probes_out = 0; + icsk->icsk_probes_out = 0; return; } @@ -248,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk) * FIXME: We ought not to do it, Solaris 2.5 actually has fixing * this behaviour in Solaris down as a bug fix. [AC] * - * Let me to explain. probes_out is zeroed by incoming ACKs + * Let me to explain. icsk_probes_out is zeroed by incoming ACKs * even if they advertise zero window. Hence, connection is killed only * if we received no ACKs for normal connection timeout. It is not killed * only because window stays zero for some time, window may be zero @@ -259,16 +260,15 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const struct inet_connection_sock *icsk = inet_csk(sk); const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); max_probes = tcp_orphan_retries(sk, alive); - if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes)) + if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) return; } - if (tp->probes_out > max_probes) { + if (icsk->icsk_probes_out > max_probes) { tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ @@ -319,19 +319,20 @@ static void tcp_retransmit_timer(struct sock *sk) goto out; if (icsk->icsk_retransmits == 0) { - if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { + if (icsk->icsk_ca_state == TCP_CA_Disorder || + icsk->icsk_ca_state == TCP_CA_Recovery) { if (tp->rx_opt.sack_ok) { - if (tp->ca_state == TCP_CA_Recovery) + if (icsk->icsk_ca_state == TCP_CA_Recovery) NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); } else { - if (tp->ca_state == TCP_CA_Recovery) + if (icsk->icsk_ca_state == TCP_CA_Recovery) NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); else NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); } - } else if (tp->ca_state == TCP_CA_Loss) { + } else if (icsk->icsk_ca_state == TCP_CA_Loss) { NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); } else { NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); @@ -449,6 +450,7 @@ void tcp_set_keepalive(struct sock *sk, int val) static void tcp_keepalive_timer (unsigned long data) { struct sock *sk = (struct sock *) data; + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); __u32 elapsed; @@ -490,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = tcp_time_stamp - tp->rcv_tstamp; if (elapsed >= keepalive_time_when(tp)) { - if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || - (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { + if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) || + (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; } if (tcp_write_wakeup(sk) <= 0) { - tp->probes_out++; + icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { /* If keepalive was lost due to local congestion, diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 9bd443d..054de24 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -82,9 +82,10 @@ struct vegas { * Instead we must wait until the completion of an RTT during * which we actually receive ACKs. */ -static inline void vegas_enable(struct tcp_sock *tp) +static inline void vegas_enable(struct sock *sk) { - struct vegas *vegas = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct vegas *vegas = inet_csk_ca(sk); /* Begin taking Vegas samples next time we send something. */ vegas->doing_vegas_now = 1; @@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp) } /* Stop taking Vegas samples for now. */ -static inline void vegas_disable(struct tcp_sock *tp) +static inline void vegas_disable(struct sock *sk) { - struct vegas *vegas = tcp_ca(tp); + struct vegas *vegas = inet_csk_ca(sk); vegas->doing_vegas_now = 0; } -static void tcp_vegas_init(struct tcp_sock *tp) +static void tcp_vegas_init(struct sock *sk) { - struct vegas *vegas = tcp_ca(tp); + struct vegas *vegas = inet_csk_ca(sk); vegas->baseRTT = 0x7fffffff; - vegas_enable(tp); + vegas_enable(sk); } /* Do RTT sampling needed for Vegas. @@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp) * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) +static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) { - struct vegas *vegas = tcp_ca(tp); + struct vegas *vegas = inet_csk_ca(sk); u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ /* Filter to find propagation delay: */ @@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) vegas->cntRTT++; } -static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) +static void tcp_vegas_state(struct sock *sk, u8 ca_state) { if (ca_state == TCP_CA_Open) - vegas_enable(tp); + vegas_enable(sk); else - vegas_disable(tp); + vegas_disable(sk); } /* @@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) * packets, _then_ we can make Vegas calculations * again. */ -static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) +static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) - tcp_vegas_init(tp); + tcp_vegas_init(sk); } -static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int flag) { - struct vegas *vegas = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) - return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); /* The key players are v_beg_snd_una and v_beg_snd_nxt. * @@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, * but that's not too awful, since we're taking the min, * rather than averaging. */ - tcp_vegas_rtt_calc(tp, seq_rtt*1000); + tcp_vegas_rtt_calc(sk, seq_rtt * 1000); /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got @@ -359,10 +361,10 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, } /* Extract info for Tcp socket info provided via netlink. */ -static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, +static void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) { - const struct vegas *ca = tcp_ca(tp); + const struct vegas *ca = inet_csk_ca(sk); if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { struct tcpvegas_info *info; @@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = { static int __init tcp_vegas_register(void) { - BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_vegas); return 0; } diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index ef82724..d8a5a2b 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -40,9 +40,9 @@ struct westwood { * way as soon as possible. It will reasonably happen within the first * RTT period of the connection lifetime. */ -static void tcp_westwood_init(struct tcp_sock *tp) +static void tcp_westwood_init(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + struct westwood *w = inet_csk_ca(sk); w->bk = 0; w->bw_ns_est = 0; @@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp) w->cumul_ack = 0; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_time_stamp; - w->snd_una = tp->snd_una; + w->snd_una = tcp_sk(sk)->snd_una; } /* @@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta) * Called after processing group of packets. * but all westwood needs is the last sample of srtt. */ -static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) +static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt) { - struct westwood *w = tcp_ca(tp); + struct westwood *w = inet_csk_ca(sk); if (cnt > 0) - w->rtt = tp->srtt >> 3; + w->rtt = tcp_sk(sk)->srtt >> 3; } /* @@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) * It updates RTT evaluation window if it is the right moment to do * it. If so it calls filter for evaluating bandwidth. */ -static void westwood_update_window(struct tcp_sock *tp) +static void westwood_update_window(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_time_stamp - w->rtt_win_sx; /* @@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp) * header prediction is successful. In such case in fact update is * straight forward and doesn't need any particular care. */ -static inline void westwood_fast_bw(struct tcp_sock *tp) +static inline void westwood_fast_bw(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); - westwood_update_window(tp); + westwood_update_window(sk); w->bk += tp->snd_una - w->snd_una; w->snd_una = tp->snd_una; @@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp) * This function evaluates cumul_ack for evaluating bk in case of * delayed or partial acks. */ -static inline u32 westwood_acked_count(struct tcp_sock *tp) +static inline u32 westwood_acked_count(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); w->cumul_ack = tp->snd_una - w->snd_una; @@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp) return w->cumul_ack; } -static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) +static inline u32 westwood_bw_rttmin(const struct sock *sk) { - struct westwood *w = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + const struct westwood *w = inet_csk_ca(sk); return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } @@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 * so avoids ever returning 0. */ -static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) +static u32 tcp_westwood_cwnd_min(struct sock *sk) { - return westwood_bw_rttmin(tp); + return westwood_bw_rttmin(sk); } -static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) +static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { - struct westwood *w = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); switch(event) { case CA_EVENT_FAST_ACK: - westwood_fast_bw(tp); + westwood_fast_bw(sk); break; case CA_EVENT_COMPLETE_CWR: - tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); + tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); break; case CA_EVENT_FRTO: - tp->snd_ssthresh = westwood_bw_rttmin(tp); + tp->snd_ssthresh = westwood_bw_rttmin(sk); break; case CA_EVENT_SLOW_ACK: - westwood_update_window(tp); - w->bk += westwood_acked_count(tp); + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); w->rtt_min = min(w->rtt, w->rtt_min); break; @@ -208,10 +212,10 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) /* Extract info for Tcp socket info provided via netlink. */ -static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, +static void tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) { - const struct westwood *ca = tcp_ca(tp); + const struct westwood *ca = inet_csk_ca(sk); if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { struct rtattr *rta; struct tcpvegas_info *info; @@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = { static int __init tcp_westwood_register(void) { - BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_westwood); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 08c55b0..3312cb8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2010,13 +2010,14 @@ static struct tcp_func ipv6_mapped = { */ static int tcp_v6_init_sock(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the @@ -2038,7 +2039,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; tp->af_specific = &ipv6_specific; - tp->ca_ops = &tcp_init_congestion_ops; + icsk->icsk_ca_ops = &tcp_init_congestion_ops; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); @@ -2135,7 +2136,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, sock_i_uid(sp), - tp->probes_out, + icsk->icsk_probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, icsk->icsk_rto, -- cgit v0.10.2 From 540722ffc3a0d7e11d97a13e1ce6f3bc23b061c1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 10 Aug 2005 05:54:28 -0300 Subject: [TCPDIAG]: Implement cheapest way of supporting DCCPDIAG_GETSOCK With ugly ifdefs, etc, but this actually: 1. keeps the existing ABI, i.e. no need to recompile the iproute2 utilities if not interested in DCCP. 2. Provides all the tcp_diag functionality in DCCP, with just a small patch that makes iproute2 support DCCP. Of course I'll get this cleaned-up in time, but for now I think its OK to be this way to quickly get this functionality. iproute2-ss050808 patch at: http://vger.kernel.org/~acme/iproute2-ss050808.dccp.patch Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h index 7a59967..190494e 100644 --- a/include/linux/tcp_diag.h +++ b/include/linux/tcp_diag.h @@ -3,6 +3,7 @@ /* Just some random number */ #define TCPDIAG_GETSOCK 18 +#define DCCPDIAG_GETSOCK 19 /* Socket identity */ struct tcpdiag_sockid diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index f6da932..d3770ae 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -34,6 +34,8 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { .port_rover = 1024 - 1, }; +EXPORT_SYMBOL_GPL(dccp_hashinfo); + static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) { return inet_csk_get_port(&dccp_hashinfo, sk, snum); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 4288ecf..f5fc84a 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -45,11 +45,15 @@ static struct sock *tcpnl; #define TCPDIAG_PUT(skb, attrtype, attrlen) \ RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +extern struct inet_hashinfo dccp_hashinfo; +#endif + static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, - int ext, u32 pid, u32 seq, u16 nlmsg_flags) + int ext, u32 pid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) { const struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct tcpdiagmsg *r; struct nlmsghdr *nlh; @@ -57,7 +61,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, struct tcpdiag_meminfo *minfo = NULL; unsigned char *b = skb->tail; - nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); nlh->nlmsg_flags = nlmsg_flags; r = NLMSG_DATA(nlh); if (sk->sk_state != TCP_TIME_WAIT) { @@ -147,8 +151,20 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->tcpdiag_expires = 0; } #undef EXPIRES_IN_MS - r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + /* + * Ahem... for now we'll have some knowledge about TCP -acme + * But this is just one of two small exceptions, both in this + * function, so lets close our eyes for some 15 lines or so... 8) + * -acme + */ + if (sk->sk_protocol == IPPROTO_TCP) { + const struct tcp_sock *tp = tcp_sk(sk); + + r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + } else + r->tcpdiag_rqueue = r->tcpdiag_wqueue = 0; + r->tcpdiag_uid = sock_i_uid(sk); r->tcpdiag_inode = sock_i_ino(sk); @@ -159,8 +175,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); } - if (info) - tcp_get_info(sk, info); + /* Ahem... for now we'll have some knowledge about TCP -acme */ + if (info) { + if (sk->sk_protocol == IPPROTO_TCP) + tcp_get_info(sk, info); + else + memset(info, 0, sizeof(*info)); + } if (sk->sk_state < TCP_TIME_WAIT && icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) @@ -194,9 +215,13 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) struct sock *sk; struct tcpdiagreq *req = NLMSG_DATA(nlh); struct sk_buff *rep; - + struct inet_hashinfo *hashinfo = &tcp_hashinfo; +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) + if (nlh->nlmsg_type == DCCPDIAG_GETSOCK) + hashinfo = &dccp_hashinfo; +#endif if (req->tcpdiag_family == AF_INET) { - sk = inet_lookup(&tcp_hashinfo, req->id.tcpdiag_dst[0], + sk = inet_lookup(hashinfo, req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, req->id.tcpdiag_src[0], req->id.tcpdiag_sport, req->id.tcpdiag_if); } @@ -230,7 +255,7 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) if (tcpdiag_fill(rep, sk, req->tcpdiag_ext, NETLINK_CB(in_skb).pid, - nlh->nlmsg_seq, 0) <= 0) + nlh->nlmsg_seq, 0, nlh) <= 0) BUG(); err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); @@ -436,12 +461,13 @@ static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, } return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, struct request_sock *req, - u32 pid, u32 seq) + u32 pid, u32 seq, + const struct nlmsghdr *unlh) { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *inet = inet_sk(sk); @@ -450,7 +476,7 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, struct nlmsghdr *nlh; long tmo; - nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); nlh->nlmsg_flags = NLM_F_MULTI; r = NLMSG_DATA(nlh); @@ -526,7 +552,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.userlocks = sk->sk_userlocks; } - for (j = s_j; j < TCP_SYNQ_HSIZE; j++) { + for (j = s_j; j < lopt->nr_table_entries; j++) { struct request_sock *req, *head = lopt->syn_table[j]; reqnum = 0; @@ -561,7 +587,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, err = tcpdiag_fill_req(skb, sk, req, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq); + cb->nlh->nlmsg_seq, cb->nlh); if (err < 0) { cb->args[3] = j + 1; cb->args[4] = reqnum; @@ -583,20 +609,26 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) int i, num; int s_i, s_num; struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + struct inet_hashinfo *hashinfo; s_i = cb->args[1]; s_num = num = cb->args[2]; - + hashinfo = &tcp_hashinfo; +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) + if (cb->nlh->nlmsg_type == DCCPDIAG_GETSOCK) + hashinfo = &dccp_hashinfo; +#endif if (cb->args[0] == 0) { if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) goto skip_listen_ht; - inet_listen_lock(&tcp_hashinfo); + + inet_listen_lock(hashinfo); for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct sock *sk; struct hlist_node *node; num = 0; - sk_for_each(sk, node, &tcp_hashinfo.listening_hash[i]) { + sk_for_each(sk, node, &hashinfo->listening_hash[i]) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) { @@ -614,7 +646,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) goto syn_recv; if (tcpdiag_dump_sock(skb, sk, cb) < 0) { - inet_listen_unlock(&tcp_hashinfo); + inet_listen_unlock(hashinfo); goto done; } @@ -623,7 +655,7 @@ syn_recv: goto next_listen; if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { - inet_listen_unlock(&tcp_hashinfo); + inet_listen_unlock(hashinfo); goto done; } @@ -637,7 +669,7 @@ next_listen: cb->args[3] = 0; cb->args[4] = 0; } - inet_listen_unlock(&tcp_hashinfo); + inet_listen_unlock(hashinfo); skip_listen_ht: cb->args[0] = 1; s_i = num = s_num = 0; @@ -646,8 +678,8 @@ skip_listen_ht: if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) return skb->len; - for (i = s_i; i < tcp_hashinfo.ehash_size; i++) { - struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[i]; + for (i = s_i; i < hashinfo->ehash_size; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; struct sock *sk; struct hlist_node *node; @@ -679,7 +711,7 @@ next_normal: if (r->tcpdiag_states&TCPF_TIME_WAIT) { sk_for_each(sk, node, - &tcp_hashinfo.ehash[i + tcp_hashinfo.ehash_size].chain) { + &hashinfo->ehash[i + hashinfo->ehash_size].chain) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) @@ -719,7 +751,11 @@ tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) return 0; - if (nlh->nlmsg_type != TCPDIAG_GETSOCK) + if (nlh->nlmsg_type != TCPDIAG_GETSOCK +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) + && nlh->nlmsg_type != DCCPDIAG_GETSOCK +#endif + ) goto err_inval; if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) -- cgit v0.10.2 From 8c60f3fab55712f23f2bd557ceedfbb00c649f37 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 10 Aug 2005 12:59:38 -0300 Subject: [CCID3]: Separate most of the packet history code This also changes the list_for_each_entry_safe_continue behaviour to match its kerneldoc comment, that is, to start after the pos passed. Also adds several helper functions from previously open coded fragments, making the code more clear. Signed-off-by: Arnaldo Carvalho de Melo diff --git a/include/linux/list.h b/include/linux/list.h index 597094e..0f2435f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -427,7 +427,8 @@ static inline void list_splice_init(struct list_head *list, * @member: the name of the list_struct within the struct. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ - for (pos = n, n = list_entry(n->member.next, typeof(*n), member); \ + for (pos = list_entry(pos->member.next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = n, n = list_entry(n->member.next, typeof(*n), member)) diff --git a/net/dccp/Makefile b/net/dccp/Makefile index c6e6ba5..25a50bd 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_IP_DCCP) += dccp.o -dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o timer.o +dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ + timer.o packet_history.o obj-y += ccids/ diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index df4adfe..15c25f6 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -34,8 +34,10 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include #include "../ccid.h" #include "../dccp.h" +#include "../packet_history.h" #include "ccid3.h" #ifdef CCID3_DEBUG @@ -82,60 +84,10 @@ enum ccid3_options { static int ccid3_debug; -static kmem_cache_t *ccid3_tx_hist_slab; -static kmem_cache_t *ccid3_rx_hist_slab; -static kmem_cache_t *ccid3_loss_interval_hist_slab; - -static inline struct ccid3_tx_hist_entry *ccid3_tx_hist_entry_new(int prio) -{ - struct ccid3_tx_hist_entry *entry = kmem_cache_alloc(ccid3_tx_hist_slab, prio); - - if (entry != NULL) - entry->ccid3htx_sent = 0; - - return entry; -} - -static inline void ccid3_tx_hist_entry_delete(struct ccid3_tx_hist_entry *entry) -{ - if (entry != NULL) - kmem_cache_free(ccid3_tx_hist_slab, entry); -} - -static inline struct ccid3_rx_hist_entry *ccid3_rx_hist_entry_new(struct sock *sk, - struct sk_buff *skb, - int prio) -{ - struct ccid3_rx_hist_entry *entry = kmem_cache_alloc(ccid3_rx_hist_slab, prio); - - if (entry != NULL) { - const struct dccp_hdr *dh = dccp_hdr(skb); - - entry->ccid3hrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - entry->ccid3hrx_win_count = dh->dccph_ccval; - entry->ccid3hrx_type = dh->dccph_type; - entry->ccid3hrx_ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; - do_gettimeofday(&(entry->ccid3hrx_tstamp)); - } - - return entry; -} - -static inline void ccid3_rx_hist_entry_delete(struct ccid3_rx_hist_entry *entry) -{ - if (entry != NULL) - kmem_cache_free(ccid3_rx_hist_slab, entry); -} +struct dccp_tx_hist *ccid3_tx_hist; +struct dccp_rx_hist *ccid3_rx_hist; -static void ccid3_rx_history_delete(struct list_head *hist) -{ - struct ccid3_rx_hist_entry *entry, *next; - - list_for_each_entry_safe(entry, next, hist, ccid3hrx_node) { - list_del_init(&entry->ccid3hrx_node); - kmem_cache_free(ccid3_rx_hist_slab, entry); - } -} +static kmem_cache_t *ccid3_loss_interval_hist_slab; static inline struct ccid3_loss_interval_hist_entry *ccid3_loss_interval_hist_entry_new(int prio) { @@ -982,7 +934,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; - struct ccid3_tx_hist_entry *new_packet = NULL; + struct dccp_tx_hist_entry *new_packet; struct timeval now; long delay; int rc = -ENOTCONN; @@ -997,12 +949,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, goto out; /* See if last packet allocated was not sent */ - if (!list_empty(&hctx->ccid3hctx_hist)) - new_packet = list_entry(hctx->ccid3hctx_hist.next, - struct ccid3_tx_hist_entry, ccid3htx_node); - - if (new_packet == NULL || new_packet->ccid3htx_sent) { - new_packet = ccid3_tx_hist_entry_new(SLAB_ATOMIC); + new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist); + if (new_packet == NULL || new_packet->dccphtx_sent) { + new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist, SLAB_ATOMIC); rc = -ENOBUFS; if (new_packet == NULL) { @@ -1011,7 +960,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, goto out; } - list_add(&new_packet->ccid3htx_node, &hctx->ccid3hctx_hist); + dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet); } do_gettimeofday(&now); @@ -1054,7 +1003,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, /* Can we send? if so add options and add to packet history */ if (rc == 0) - new_packet->ccid3htx_win_count = DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; + new_packet->dccphtx_win_count = + DCCP_SKB_CB(skb)->dccpd_ccval = + hctx->ccid3hctx_last_win_count; out: return rc; } @@ -1063,7 +1014,6 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; - struct ccid3_tx_hist_entry *packet = NULL; struct timeval now; // ccid3_pr_debug("%s, sk=%p, more=%d, len=%d\n", dccp_role(sk), sk, more, len); @@ -1080,20 +1030,23 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) /* check if we have sent a data packet */ if (len > 0) { unsigned long quarter_rtt; + struct dccp_tx_hist_entry *packet; - if (list_empty(&hctx->ccid3hctx_hist)) { + packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist); + if (packet == NULL) { printk(KERN_CRIT "%s: packet doesn't exists in history!\n", __FUNCTION__); return; } - packet = list_entry(hctx->ccid3hctx_hist.next, struct ccid3_tx_hist_entry, ccid3htx_node); - if (packet->ccid3htx_sent) { + if (packet->dccphtx_sent) { printk(KERN_CRIT "%s: no unsent packet in history!\n", __FUNCTION__); return; } - packet->ccid3htx_tstamp = now; - packet->ccid3htx_seqno = dp->dccps_gss; - // ccid3_pr_debug("%s, sk=%p, seqno=%llu inserted!\n", dccp_role(sk), sk, packet->ccid3htx_seqno); - + packet->dccphtx_tstamp = now; + packet->dccphtx_seqno = dp->dccps_gss; +#if 0 + ccid3_pr_debug("%s, sk=%p, seqno=%llu inserted!\n", + dccp_role(sk), sk, packet->dccphtx_seqno); +#endif /* * Check if win_count have changed */ /* COMPLIANCE_BEGIN @@ -1106,18 +1059,18 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) min_t(unsigned long, quarter_rtt, 5)) % 16; ccid3_pr_debug("%s, sk=%p, window changed from %u to %u!\n", dccp_role(sk), sk, - packet->ccid3htx_win_count, + packet->dccphtx_win_count, hctx->ccid3hctx_last_win_count); } /* COMPLIANCE_END */ #if 0 ccid3_pr_debug("%s, sk=%p, packet sent (%llu,%u)\n", dccp_role(sk), sk, - packet->ccid3htx_seqno, - packet->ccid3htx_win_count); + packet->dccphtx_seqno, + packet->dccphtx_win_count); #endif hctx->ccid3hctx_idle = 0; - packet->ccid3htx_sent = 1; + packet->dccphtx_sent = 1; } else ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n", dccp_role(sk), sk, dp->dccps_gss); @@ -1152,7 +1105,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; struct ccid3_options_received *opt_recv; - struct ccid3_tx_hist_entry *entry, *next, *packet; + struct dccp_tx_hist_entry *packet; unsigned long next_tmout; u16 t_elapsed; u32 pinv; @@ -1191,13 +1144,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* Calculate new round trip sample by * R_sample = (now - t_recvdata) - t_delay */ /* get t_recvdata from history */ - packet = NULL; - list_for_each_entry_safe(entry, next, &hctx->ccid3hctx_hist, ccid3htx_node) - if (entry->ccid3htx_seqno == DCCP_SKB_CB(skb)->dccpd_ack_seq) { - packet = entry; - break; - } - + packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, + DCCP_SKB_CB(skb)->dccpd_ack_seq); if (packet == NULL) { ccid3_pr_debug("%s, sk=%p, seqno %llu(%s) does't exist in history!\n", dccp_role(sk), sk, DCCP_SKB_CB(skb)->dccpd_ack_seq, @@ -1206,7 +1154,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* Update RTT */ - r_sample = now_delta(packet->ccid3htx_tstamp); + r_sample = now_delta(packet->dccphtx_tstamp); /* FIXME: */ // r_sample -= usecs_to_jiffies(t_elapsed * 10); @@ -1273,10 +1221,9 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ccid3_calc_new_delta(hctx); /* remove all packets older than the one acked from history */ - list_for_each_entry_safe_continue(entry, next, &hctx->ccid3hctx_hist, ccid3htx_node) { - list_del_init(&entry->ccid3htx_node); - ccid3_tx_hist_entry_delete(entry); - } + dccp_tx_hist_purge_older(ccid3_tx_hist, + &hctx->ccid3hctx_hist, packet); + if (hctx->ccid3hctx_x < 10) { ccid3_pr_debug("ccid3_hc_tx_packet_recv hctx->ccid3hctx_x < 10\n"); hctx->ccid3hctx_x = 10; @@ -1285,7 +1232,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* Schedule no feedback timer to expire in max(4 * R, 2 * s / X) */ next_tmout = max(inet_csk(sk)->icsk_rto, - 2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x/10)); + (2 * (hctx->ccid3hctx_s * 100000) / + (hctx->ccid3hctx_x / 10))); /* maths with 100000 and 10 is to prevent overflow with 32 bit */ ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to expire in %lu jiffies (%luus)\n", @@ -1408,7 +1356,6 @@ static void ccid3_hc_tx_exit(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; - struct ccid3_tx_hist_entry *entry, *next; ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); BUG_ON(hctx == NULL); @@ -1417,10 +1364,7 @@ static void ccid3_hc_tx_exit(struct sock *sk) sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); /* Empty packet history */ - list_for_each_entry_safe(entry, next, &hctx->ccid3hctx_hist, ccid3htx_node) { - list_del_init(&entry->ccid3htx_node); - ccid3_tx_hist_entry_delete(entry); - } + dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist); kfree(dp->dccps_hc_tx_ccid_private); dp->dccps_hc_tx_ccid_private = NULL; @@ -1462,39 +1406,40 @@ static inline void ccid3_hc_rx_set_state(struct sock *sk, enum ccid3_hc_rx_state hcrx->ccid3hcrx_state = state; } -static int ccid3_hc_rx_add_hist(struct sock *sk, struct ccid3_rx_hist_entry *packet) +static int ccid3_hc_rx_add_hist(struct sock *sk, + struct dccp_rx_hist_entry *packet) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; - struct ccid3_rx_hist_entry *entry, *next; + struct dccp_rx_hist_entry *entry, *next, *iter; u8 num_later = 0; - if (list_empty(&hcrx->ccid3hcrx_hist)) - list_add(&packet->ccid3hrx_node, &hcrx->ccid3hcrx_hist); + iter = dccp_rx_hist_head(&hcrx->ccid3hcrx_hist); + if (iter == NULL) + dccp_rx_hist_add_entry(&hcrx->ccid3hcrx_hist, packet); else { - u64 seqno = packet->ccid3hrx_seqno; - struct ccid3_rx_hist_entry *iter = list_entry(hcrx->ccid3hcrx_hist.next, - struct ccid3_rx_hist_entry, - ccid3hrx_node); - if (after48(seqno, iter->ccid3hrx_seqno)) - list_add(&packet->ccid3hrx_node, &hcrx->ccid3hcrx_hist); + const u64 seqno = packet->dccphrx_seqno; + + if (after48(seqno, iter->dccphrx_seqno)) + dccp_rx_hist_add_entry(&hcrx->ccid3hcrx_hist, packet); else { - if (iter->ccid3hrx_type == DCCP_PKT_DATA || - iter->ccid3hrx_type == DCCP_PKT_DATAACK) + if (dccp_rx_hist_entry_data_packet(iter)) num_later = 1; - list_for_each_entry_continue(iter, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { - if (after48(seqno, iter->ccid3hrx_seqno)) { - list_add(&packet->ccid3hrx_node, &iter->ccid3hrx_node); + list_for_each_entry_continue(iter, + &hcrx->ccid3hcrx_hist, + dccphrx_node) { + if (after48(seqno, iter->dccphrx_seqno)) { + dccp_rx_hist_add_entry(&iter->dccphrx_node, + packet); goto trim_history; } - if (iter->ccid3hrx_type == DCCP_PKT_DATA || - iter->ccid3hrx_type == DCCP_PKT_DATAACK) + if (dccp_rx_hist_entry_data_packet(iter)) num_later++; if (num_later == TFRC_RECV_NUM_LATE_LOSS) { - ccid3_rx_hist_entry_delete(packet); + dccp_rx_hist_entry_delete(ccid3_rx_hist, packet); ccid3_pr_debug("%s, sk=%p, packet(%llu) already lost!\n", dccp_role(sk), sk, seqno); return 1; @@ -1502,7 +1447,8 @@ static int ccid3_hc_rx_add_hist(struct sock *sk, struct ccid3_rx_hist_entry *pac } if (num_later < TFRC_RECV_NUM_LATE_LOSS) - list_add_tail(&packet->ccid3hrx_node, &hcrx->ccid3hcrx_hist); + dccp_rx_hist_add_entry(&hcrx->ccid3hcrx_hist, + packet); /* FIXME: else what? should we destroy the packet like above? */ } } @@ -1512,12 +1458,12 @@ trim_history: num_later = TFRC_RECV_NUM_LATE_LOSS + 1; if (!list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { - list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, + dccphrx_node) { if (num_later == 0) { - list_del_init(&entry->ccid3hrx_node); - ccid3_rx_hist_entry_delete(entry); - } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || - entry->ccid3hrx_type == DCCP_PKT_DATAACK) + list_del_init(&entry->dccphrx_node); + dccp_rx_hist_entry_delete(ccid3_rx_hist, entry); + } else if (dccp_rx_hist_entry_data_packet(entry)) --num_later; } } else { @@ -1528,7 +1474,8 @@ trim_history: * We have no loss interval history so we need at least one * rtt:s of data packets to approximate rtt. */ - list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, + dccphrx_node) { if (num_later == 0) { switch (step) { case 0: @@ -1540,10 +1487,11 @@ trim_history: step = 2; /* OK, find next data packet */ num_later = 1; - win_count = entry->ccid3hrx_win_count; + win_count = entry->dccphrx_win_count; break; case 2: - tmp = win_count - entry->ccid3hrx_win_count; + tmp = (win_count - + entry->dccphrx_win_count); if (tmp < 0) tmp += TFRC_WIN_COUNT_LIMIT; if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) { @@ -1554,12 +1502,11 @@ trim_history: num_later = 1; break; case 3: - list_del_init(&entry->ccid3hrx_node); - ccid3_rx_hist_entry_delete(entry); + list_del_init(&entry->dccphrx_node); + dccp_rx_hist_entry_delete(ccid3_rx_hist, entry); break; } - } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || - entry->ccid3hrx_type == DCCP_PKT_DATAACK) + } else if (dccp_rx_hist_entry_data_packet(entry)) --num_later; } } @@ -1571,7 +1518,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; - struct ccid3_rx_hist_entry *entry, *packet; + struct dccp_rx_hist_entry *packet; ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); @@ -1594,14 +1541,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) return; } - packet = NULL; - list_for_each_entry(entry, &hcrx->ccid3hcrx_hist, ccid3hrx_node) - if (entry->ccid3hrx_type == DCCP_PKT_DATA || - entry->ccid3hrx_type == DCCP_PKT_DATAACK) { - packet = entry; - break; - } - + packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist); if (packet == NULL) { printk(KERN_CRIT "%s: %s, sk=%p, no data packet in history!\n", __FUNCTION__, dccp_role(sk), sk); @@ -1610,12 +1550,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) } do_gettimeofday(&(hcrx->ccid3hcrx_tstamp_last_feedback)); - hcrx->ccid3hcrx_last_counter = packet->ccid3hrx_win_count; - hcrx->ccid3hcrx_seqno_last_counter = packet->ccid3hrx_seqno; + hcrx->ccid3hcrx_last_counter = packet->dccphrx_win_count; + hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno; hcrx->ccid3hcrx_bytes_recv = 0; /* Convert to multiples of 10us */ - hcrx->ccid3hcrx_elapsed_time = now_delta(packet->ccid3hrx_tstamp) / 10; + hcrx->ccid3hcrx_elapsed_time = now_delta(packet->dccphrx_tstamp) / 10; if (hcrx->ccid3hcrx_p == 0) hcrx->ccid3hcrx_pinv = ~0; else @@ -1686,7 +1626,7 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; - struct ccid3_rx_hist_entry *entry, *next, *tail = NULL; + struct dccp_rx_hist_entry *entry, *next, *tail = NULL; u32 rtt, delta, x_recv, fval, p, tmp2; struct timeval tstamp, tmp_tv; int interval = 0; @@ -1694,19 +1634,19 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) int step = 0; u64 tmp1; - list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { - if (entry->ccid3hrx_type == DCCP_PKT_DATA || - entry->ccid3hrx_type == DCCP_PKT_DATAACK) { + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, + dccphrx_node) { + if (dccp_rx_hist_entry_data_packet(entry)) { tail = entry; switch (step) { case 0: - tstamp = entry->ccid3hrx_tstamp; - win_count = entry->ccid3hrx_win_count; + tstamp = entry->dccphrx_tstamp; + win_count = entry->dccphrx_win_count; step = 1; break; case 1: - interval = win_count - entry->ccid3hrx_win_count; + interval = win_count - entry->dccphrx_win_count; if (interval < 0) interval += TFRC_WIN_COUNT_LIMIT; if (interval > 4) @@ -1728,7 +1668,7 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) interval = 1; } found: - timeval_sub(tstamp,tail->ccid3hrx_tstamp,&tmp_tv); + timeval_sub(tstamp,tail->dccphrx_tstamp,&tmp_tv); rtt = (tmp_tv.tv_sec * USEC_IN_SEC + tmp_tv.tv_usec) * 4 / interval; ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n", dccp_role(sk), sk, rtt); @@ -1797,34 +1737,33 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; - struct ccid3_rx_hist_entry *entry, *a_next, *b_next, *packet; - struct ccid3_rx_hist_entry *a_loss = NULL; - struct ccid3_rx_hist_entry *b_loss = NULL; + struct dccp_rx_hist_entry *entry, *next, *packet; + struct dccp_rx_hist_entry *a_loss = NULL; + struct dccp_rx_hist_entry *b_loss = NULL; u64 seq_loss = DCCP_MAX_SEQNO + 1; u8 win_loss = 0; u8 num_later = TFRC_RECV_NUM_LATE_LOSS; - list_for_each_entry_safe(entry, b_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, + dccphrx_node) { if (num_later == 0) { b_loss = entry; break; - } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || - entry->ccid3hrx_type == DCCP_PKT_DATAACK) + } else if (dccp_rx_hist_entry_data_packet(entry)) --num_later; } if (b_loss == NULL) goto out_update_li; - a_next = b_next; num_later = 1; - list_for_each_entry_safe_continue(entry, a_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { + list_for_each_entry_safe_continue(entry, next, &hcrx->ccid3hcrx_hist, + dccphrx_node) { if (num_later == 0) { a_loss = entry; break; - } else if (entry->ccid3hrx_type == DCCP_PKT_DATA || - entry->ccid3hrx_type == DCCP_PKT_DATAACK) + } else if (dccp_rx_hist_entry_data_packet(entry)) --num_later; } @@ -1844,12 +1783,13 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) /* Locate a lost data packet */ entry = packet = b_loss; - list_for_each_entry_safe_continue(entry, b_next, &hcrx->ccid3hcrx_hist, ccid3hrx_node) { - u64 delta = dccp_delta_seqno(entry->ccid3hrx_seqno, packet->ccid3hrx_seqno); + list_for_each_entry_safe_continue(entry, next, &hcrx->ccid3hcrx_hist, + dccphrx_node) { + u64 delta = dccp_delta_seqno(entry->dccphrx_seqno, + packet->dccphrx_seqno); if (delta != 0) { - if (packet->ccid3hrx_type == DCCP_PKT_DATA || - packet->ccid3hrx_type == DCCP_PKT_DATAACK) + if (dccp_rx_hist_entry_data_packet(packet)) --delta; /* * FIXME: check this, probably this % usage is because @@ -1858,10 +1798,12 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) */ #if 0 if (delta % DCCP_NDP_LIMIT != - (packet->ccid3hrx_ndp - entry->ccid3hrx_ndp) % DCCP_NDP_LIMIT) + (packet->dccphrx_ndp - + entry->dccphrx_ndp) % DCCP_NDP_LIMIT) #endif - if (delta != packet->ccid3hrx_ndp - entry->ccid3hrx_ndp) { - seq_loss = entry->ccid3hrx_seqno; + if (delta != + packet->dccphrx_ndp - entry->dccphrx_ndp) { + seq_loss = entry->dccphrx_seqno; dccp_inc_seqno(&seq_loss); } } @@ -1871,7 +1813,7 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) } if (seq_loss != DCCP_MAX_SEQNO + 1) - win_loss = a_loss->ccid3hrx_win_count; + win_loss = a_loss->dccphrx_win_count; out_update_li: ccid3_hc_rx_update_li(sk, seq_loss, win_loss); @@ -1920,7 +1862,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; - struct ccid3_rx_hist_entry *packet; + struct dccp_rx_hist_entry *packet; struct timeval now; u8 win_count; u32 p_prev; @@ -1964,14 +1906,16 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) return; } - packet = ccid3_rx_hist_entry_new(sk, skb, SLAB_ATOMIC); + packet = dccp_rx_hist_entry_new(ccid3_rx_hist, + dp->dccps_options_received.dccpor_ndp, + skb, SLAB_ATOMIC); if (packet == NULL) { ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet to history (consider it lost)!", dccp_role(sk), sk); return; } - win_count = packet->ccid3hrx_win_count; + win_count = packet->dccphrx_win_count; ins = ccid3_hc_rx_add_hist(sk, packet); @@ -2060,7 +2004,7 @@ static void ccid3_hc_rx_exit(struct sock *sk) ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); /* Empty packet history */ - ccid3_rx_history_delete(&hcrx->ccid3hcrx_hist); + dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist); /* Empty loss interval history */ ccid3_loss_interval_history_delete(&hcrx->ccid3hcrx_loss_interval_hist); @@ -2093,41 +2037,38 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); static __init int ccid3_module_init(void) { - int rc = -ENOMEM; + int rc = -ENOBUFS; - ccid3_tx_hist_slab = kmem_cache_create("dccp_ccid3_tx_history", - sizeof(struct ccid3_tx_hist_entry), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (ccid3_tx_hist_slab == NULL) + ccid3_rx_hist = dccp_rx_hist_new("ccid3"); + if (ccid3_rx_hist == NULL) goto out; - ccid3_rx_hist_slab = kmem_cache_create("dccp_ccid3_rx_history", - sizeof(struct ccid3_rx_hist_entry), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (ccid3_rx_hist_slab == NULL) - goto out_free_tx_history; + ccid3_tx_hist = dccp_tx_hist_new("ccid3"); + if (ccid3_tx_hist == NULL) + goto out_free_rx; - ccid3_loss_interval_hist_slab = kmem_cache_create("dccp_ccid3_loss_interval_history", - sizeof(struct ccid3_loss_interval_hist_entry), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); + ccid3_loss_interval_hist_slab = kmem_cache_create("li_hist_ccid3", + sizeof(struct ccid3_loss_interval_hist_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); if (ccid3_loss_interval_hist_slab == NULL) - goto out_free_rx_history; + goto out_free_tx; rc = ccid_register(&ccid3); if (rc != 0) goto out_free_loss_interval_history; - out: return rc; + out_free_loss_interval_history: kmem_cache_destroy(ccid3_loss_interval_hist_slab); ccid3_loss_interval_hist_slab = NULL; -out_free_rx_history: - kmem_cache_destroy(ccid3_rx_hist_slab); - ccid3_rx_hist_slab = NULL; -out_free_tx_history: - kmem_cache_destroy(ccid3_tx_hist_slab); - ccid3_tx_hist_slab = NULL; +out_free_tx: + dccp_tx_hist_delete(ccid3_tx_hist); + ccid3_tx_hist = NULL; +out_free_rx: + dccp_rx_hist_delete(ccid3_rx_hist); + ccid3_rx_hist = NULL; goto out; } module_init(ccid3_module_init); @@ -2136,13 +2077,13 @@ static __exit void ccid3_module_exit(void) { ccid_unregister(&ccid3); - if (ccid3_tx_hist_slab != NULL) { - kmem_cache_destroy(ccid3_tx_hist_slab); - ccid3_tx_hist_slab = NULL; + if (ccid3_tx_hist != NULL) { + dccp_tx_hist_delete(ccid3_tx_hist); + ccid3_tx_hist = NULL; } - if (ccid3_rx_hist_slab != NULL) { - kmem_cache_destroy(ccid3_rx_hist_slab); - ccid3_rx_hist_slab = NULL; + if (ccid3_rx_hist != NULL) { + dccp_rx_hist_delete(ccid3_rx_hist); + ccid3_rx_hist = NULL; } if (ccid3_loss_interval_hist_slab != NULL) { kmem_cache_destroy(ccid3_loss_interval_hist_slab); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 5d6b623..d2705fb 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -38,15 +38,6 @@ #include #include -#include - -struct ccid3_tx_hist_entry { - struct list_head ccid3htx_node; - u64 ccid3htx_seqno:48, - ccid3htx_win_count:8, - ccid3htx_sent:1; - struct timeval ccid3htx_tstamp; -}; struct ccid3_options_received { u64 ccid3or_seqno:48, @@ -102,15 +93,6 @@ struct ccid3_loss_interval_hist_entry { u32 ccid3lih_interval; }; -struct ccid3_rx_hist_entry { - struct list_head ccid3hrx_node; - u64 ccid3hrx_seqno:48, - ccid3hrx_win_count:4, - ccid3hrx_type:4; - u32 ccid3hrx_ndp; /* In fact it is from 8 to 24 bits */ - struct timeval ccid3hrx_tstamp; -}; - struct ccid3_hc_rx_sock { u64 ccid3hcrx_seqno_last_counter:48, ccid3hcrx_state:8, diff --git a/net/dccp/packet_history.c b/net/dccp/packet_history.c new file mode 100644 index 0000000..6b41489 --- /dev/null +++ b/net/dccp/packet_history.c @@ -0,0 +1,198 @@ +/* + * net/dccp/packet_history.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include + +#include "packet_history.h" + +struct dccp_rx_hist *dccp_rx_hist_new(const char *name) +{ + struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); + static const char dccp_rx_hist_mask[] = "rx_hist_%s"; + char *slab_name; + + if (hist == NULL) + goto out; + + slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1, + GFP_ATOMIC); + if (slab_name == NULL) + goto out_free_hist; + + sprintf(slab_name, dccp_rx_hist_mask, name); + hist->dccprxh_slab = kmem_cache_create(slab_name, + sizeof(struct dccp_rx_hist_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (hist->dccprxh_slab == NULL) + goto out_free_slab_name; +out: + return hist; +out_free_slab_name: + kfree(slab_name); +out_free_hist: + kfree(hist); + hist = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_new); + +void dccp_rx_hist_delete(struct dccp_rx_hist *hist) +{ + const char* name = kmem_cache_name(hist->dccprxh_slab); + + kmem_cache_destroy(hist->dccprxh_slab); + kfree(name); + kfree(hist); +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_delete); + +void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list) +{ + struct dccp_rx_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, list, dccphrx_node) { + list_del_init(&entry->dccphrx_node); + kmem_cache_free(hist->dccprxh_slab, entry); + } +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_purge); + +struct dccp_rx_hist_entry * + dccp_rx_hist_find_data_packet(const struct list_head *list) +{ + struct dccp_rx_hist_entry *entry, *packet = NULL; + + list_for_each_entry(entry, list, dccphrx_node) + if (entry->dccphrx_type == DCCP_PKT_DATA || + entry->dccphrx_type == DCCP_PKT_DATAACK) { + packet = entry; + break; + } + + return packet; +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet); + +struct dccp_tx_hist *dccp_tx_hist_new(const char *name) +{ + struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); + static const char dccp_tx_hist_mask[] = "tx_hist_%s"; + char *slab_name; + + if (hist == NULL) + goto out; + + slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1, + GFP_ATOMIC); + if (slab_name == NULL) + goto out_free_hist; + + sprintf(slab_name, dccp_tx_hist_mask, name); + hist->dccptxh_slab = kmem_cache_create(slab_name, + sizeof(struct dccp_tx_hist_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (hist->dccptxh_slab == NULL) + goto out_free_slab_name; +out: + return hist; +out_free_slab_name: + kfree(slab_name); +out_free_hist: + kfree(hist); + hist = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_new); + +void dccp_tx_hist_delete(struct dccp_tx_hist *hist) +{ + const char* name = kmem_cache_name(hist->dccptxh_slab); + + kmem_cache_destroy(hist->dccptxh_slab); + kfree(name); + kfree(hist); +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_delete); + +struct dccp_tx_hist_entry *dccp_tx_hist_find_entry(const struct list_head *list, + const u64 seq) +{ + struct dccp_tx_hist_entry *packet = NULL, *entry; + + list_for_each_entry(entry, list, dccphtx_node) + if (entry->dccphtx_seqno == seq) { + packet = entry; + break; + } + + return packet; +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry); + +void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, struct list_head *list, + struct dccp_tx_hist_entry *packet) +{ + struct dccp_tx_hist_entry *next; + + list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) { + list_del_init(&packet->dccphtx_node); + dccp_tx_hist_entry_delete(hist, packet); + } +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older); + +void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list) +{ + struct dccp_tx_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, list, dccphtx_node) { + list_del_init(&entry->dccphtx_node); + dccp_tx_hist_entry_delete(hist, entry); + } +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_purge); diff --git a/net/dccp/packet_history.h b/net/dccp/packet_history.h new file mode 100644 index 0000000..565dc96 --- /dev/null +++ b/net/dccp/packet_history.h @@ -0,0 +1,182 @@ +/* + * net/dccp/packet_history.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DCCP_PKT_HIST_ +#define _DCCP_PKT_HIST_ + +#include +#include +#include +#include + +#include "dccp.h" + +struct dccp_tx_hist_entry { + struct list_head dccphtx_node; + u64 dccphtx_seqno:48, + dccphtx_win_count:8, + dccphtx_sent:1; + struct timeval dccphtx_tstamp; +}; + +struct dccp_rx_hist_entry { + struct list_head dccphrx_node; + u64 dccphrx_seqno:48, + dccphrx_win_count:4, + dccphrx_type:4; + u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */ + struct timeval dccphrx_tstamp; +}; + +struct dccp_tx_hist { + kmem_cache_t *dccptxh_slab; +}; + +extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name); +extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist); + +struct dccp_rx_hist { + kmem_cache_t *dccprxh_slab; +}; + +extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name); +extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist); +extern struct dccp_rx_hist_entry * + dccp_rx_hist_find_data_packet(const struct list_head *list); + +static inline struct dccp_tx_hist_entry * + dccp_tx_hist_entry_new(struct dccp_tx_hist *hist, + const int prio) +{ + struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab, + prio); + + if (entry != NULL) + entry->dccphtx_sent = 0; + + return entry; +} + +static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist, + struct dccp_tx_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(hist->dccptxh_slab, entry); +} + +extern struct dccp_tx_hist_entry * + dccp_tx_hist_find_entry(const struct list_head *list, + const u64 seq); + +static inline void dccp_tx_hist_add_entry(struct list_head *list, + struct dccp_tx_hist_entry *entry) +{ + list_add(&entry->dccphtx_node, list); +} + +extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, + struct list_head *list, + struct dccp_tx_hist_entry *next); + +extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist, + struct list_head *list); + +static inline struct dccp_tx_hist_entry *dccp_tx_hist_head(struct list_head *list) +{ + struct dccp_tx_hist_entry *head = NULL; + + if (!list_empty(list)) + head = list_entry(list->next, struct dccp_tx_hist_entry, + dccphtx_node); + return head; +} + +static inline struct dccp_rx_hist_entry * + dccp_rx_hist_entry_new(struct dccp_rx_hist *hist, + const u32 ndp, + const struct sk_buff *skb, + const int prio) +{ + struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab, + prio); + + if (entry != NULL) { + const struct dccp_hdr *dh = dccp_hdr(skb); + + entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + entry->dccphrx_win_count = dh->dccph_ccval; + entry->dccphrx_type = dh->dccph_type; + entry->dccphrx_ndp = ndp; + do_gettimeofday(&(entry->dccphrx_tstamp)); + } + + return entry; +} + +static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist, + struct dccp_rx_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(hist->dccprxh_slab, entry); +} + +extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist, + struct list_head *list); + +static inline void dccp_rx_hist_add_entry(struct list_head *list, + struct dccp_rx_hist_entry *entry) +{ + list_add(&entry->dccphrx_node, list); +} + +static inline struct dccp_rx_hist_entry *dccp_rx_hist_head(struct list_head *list) +{ + struct dccp_rx_hist_entry *head = NULL; + + if (!list_empty(list)) + head = list_entry(list->next, struct dccp_rx_hist_entry, + dccphrx_node); + return head; +} + +static inline int + dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry) +{ + return entry->dccphrx_type == DCCP_PKT_DATA || + entry->dccphrx_type == DCCP_PKT_DATAACK; +} + +#endif /* _DCCP_PKT_HIST_ */ -- cgit v0.10.2 From cef07fd6029c20f95571d09cefce45ee3276a920 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 10 Aug 2005 13:29:27 -0300 Subject: [CCID3]: Ditch USEC_IN_SEC as time.h has USEC_PER_SEC That is equivalent, no need to have a private one. Signed-off-by: Arnaldo Carvalho de Melo diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 15c25f6..80f12c9 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -55,12 +55,10 @@ extern int ccid3_debug; #define TFRC_STD_PACKET_SIZE 256 #define TFRC_MAX_PACKET_SIZE 65535 -#define USEC_IN_SEC 1000000 - -#define TFRC_INITIAL_TIMEOUT (2 * USEC_IN_SEC) +#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) /* two seconds as per CCID3 spec 11 */ -#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_IN_SEC / (2 * HZ)) +#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) /* above is in usecs - half the scheduling granularity as per RFC3448 4.6 */ #define TFRC_WIN_COUNT_PER_RTT 4 @@ -155,20 +153,23 @@ static inline void ccid3_hc_tx_set_state(struct sock *sk, enum ccid3_hc_tx_state hctx->ccid3hctx_state = state; } -static void timeval_sub(struct timeval large, struct timeval small, struct timeval *result) { - +static void timeval_sub(struct timeval large, struct timeval small, + struct timeval *result) +{ result->tv_sec = large.tv_sec-small.tv_sec; if (large.tv_usec < small.tv_usec) { (result->tv_sec)--; - result->tv_usec = USEC_IN_SEC+large.tv_usec-small.tv_usec; + result->tv_usec = USEC_PER_SEC + + large.tv_usec - small.tv_usec; } else result->tv_usec = large.tv_usec-small.tv_usec; } -static inline void timeval_fix(struct timeval *tv) { - if (tv->tv_usec >= USEC_IN_SEC) { +static inline void timeval_fix(struct timeval *tv) +{ + if (tv->tv_usec >= USEC_PER_SEC) { tv->tv_sec++; - tv->tv_usec -= USEC_IN_SEC; + tv->tv_usec -= USEC_PER_SEC; } } @@ -1185,7 +1186,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) r_sample); /* Update timeout interval */ - inet_csk(sk)->icsk_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, USEC_IN_SEC); + inet_csk(sk)->icsk_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, + USEC_PER_SEC); /* Update receive rate */ hctx->ccid3hctx_x_recv = x_recv; /* x_recv in bytes per second */ @@ -1210,7 +1212,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* Update next send time */ if (hctx->ccid3hctx_t_ipi > (hctx->ccid3hctx_t_nom).tv_usec) { - (hctx->ccid3hctx_t_nom).tv_usec += USEC_IN_SEC; + hctx->ccid3hctx_t_nom.tv_usec += USEC_PER_SEC; (hctx->ccid3hctx_t_nom).tv_sec--; } /* FIXME - if no feedback then t_ipi can go > 1 second */ @@ -1344,7 +1346,7 @@ static int ccid3_hc_tx_init(struct sock *sk) hctx->ccid3hctx_x = hctx->ccid3hctx_s; /* set transmission rate to 1 packet per second */ hctx->ccid3hctx_rtt = 4; /* See ccid3_hc_tx_packet_sent win_count calculatation */ - inet_csk(sk)->icsk_rto = USEC_IN_SEC; + inet_csk(sk)->icsk_rto = USEC_PER_SEC; hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; INIT_LIST_HEAD(&hctx->ccid3hctx_hist); init_timer(&hctx->ccid3hctx_no_feedback_timer); @@ -1531,7 +1533,8 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) if (delta == 0) delta = 1; /* to prevent divide by zero */ - hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv * USEC_IN_SEC) / delta; + hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv * + USEC_PER_SEC) / delta; } break; default: @@ -1669,7 +1672,7 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) } found: timeval_sub(tstamp,tail->dccphrx_tstamp,&tmp_tv); - rtt = (tmp_tv.tv_sec * USEC_IN_SEC + tmp_tv.tv_usec) * 4 / interval; + rtt = (tmp_tv.tv_sec * USEC_PER_SEC + tmp_tv.tv_usec) * 4 / interval; ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n", dccp_role(sk), sk, rtt); if (rtt == 0) @@ -1679,7 +1682,7 @@ found: if (delta == 0) delta = 1; - x_recv = (hcrx->ccid3hcrx_bytes_recv * USEC_IN_SEC) / delta; + x_recv = (hcrx->ccid3hcrx_bytes_recv * USEC_PER_SEC) / delta; tmp1 = (u64)x_recv * (u64)rtt; do_div(tmp1,10000000); -- cgit v0.10.2 From 07dc3f0718d2c88c3094a0aadeeb4744effc5439 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Wed, 10 Aug 2005 14:16:04 -0700 Subject: [NET]: Make use of ->private_data in sockfd_lookup Please consider the patch below which makes use of file->private_data to store the pointer to the socket, which avoids touching several unused cachelines in the dentry and inode in sockfd_lookup. Signed-off-by: Benjamin LaHaise Signed-off-by: David S. Miller diff --git a/net/socket.c b/net/socket.c index 6f2a1788..5f76ab8 100644 --- a/net/socket.c +++ b/net/socket.c @@ -404,6 +404,7 @@ int sock_map_fd(struct socket *sock) file->f_mode = FMODE_READ | FMODE_WRITE; file->f_flags = O_RDWR; file->f_pos = 0; + file->private_data = sock; fd_install(fd, file); } @@ -436,6 +437,9 @@ struct socket *sockfd_lookup(int fd, int *err) return NULL; } + if (file->f_op == &socket_file_ops) + return file->private_data; /* set in sock_map_fd */ + inode = file->f_dentry->d_inode; if (!S_ISSOCK(inode->i_mode)) { *err = -ENOTSOCK; -- cgit v0.10.2 From 14ab9b867aa6c107b4886bdc5b23f277ab10792e Mon Sep 17 00:00:00 2001 From: Peter Hagervall Date: Wed, 10 Aug 2005 14:18:16 -0700 Subject: [BNX2]: Possible sparse fixes, take two This patch contains the following possible cleanups/fixes: - use C99 struct initializers - make a few arrays and structs static - remove a few uses of literal 0 as NULL pointer - use convenience function instead of cast+dereference in bnx2_ioctl() - remove superfluous casts to u8 * in calls to readl/writel Signed-off-by: Peter Hagervall Acked-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 8acc655..3a9d6a8 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -52,7 +52,6 @@ static struct { { "HP NC370i Multifunction Gigabit Server Adapter" }, { "Broadcom NetXtreme II BCM5706 1000Base-SX" }, { "HP NC370F Multifunction Gigabit Server Adapter" }, - { 0 }, }; static struct pci_device_id bnx2_pci_tbl[] = { @@ -3507,11 +3506,11 @@ bnx2_test_registers(struct bnx2 *bp) rw_mask = reg_tbl[i].rw_mask; ro_mask = reg_tbl[i].ro_mask; - save_val = readl((u8 *) bp->regview + offset); + save_val = readl(bp->regview + offset); - writel(0, (u8 *) bp->regview + offset); + writel(0, bp->regview + offset); - val = readl((u8 *) bp->regview + offset); + val = readl(bp->regview + offset); if ((val & rw_mask) != 0) { goto reg_test_err; } @@ -3520,9 +3519,9 @@ bnx2_test_registers(struct bnx2 *bp) goto reg_test_err; } - writel(0xffffffff, (u8 *) bp->regview + offset); + writel(0xffffffff, bp->regview + offset); - val = readl((u8 *) bp->regview + offset); + val = readl(bp->regview + offset); if ((val & rw_mask) != rw_mask) { goto reg_test_err; } @@ -3531,11 +3530,11 @@ bnx2_test_registers(struct bnx2 *bp) goto reg_test_err; } - writel(save_val, (u8 *) bp->regview + offset); + writel(save_val, bp->regview + offset); continue; reg_test_err: - writel(save_val, (u8 *) bp->regview + offset); + writel(save_val, bp->regview + offset); ret = -ENODEV; break; } @@ -4698,7 +4697,7 @@ bnx2_set_rx_csum(struct net_device *dev, u32 data) #define BNX2_NUM_STATS 45 -struct { +static struct { char string[ETH_GSTRING_LEN]; } bnx2_stats_str_arr[BNX2_NUM_STATS] = { { "rx_bytes" }, @@ -4750,7 +4749,7 @@ struct { #define STATS_OFFSET32(offset_name) (offsetof(struct statistics_block, offset_name) / 4) -unsigned long bnx2_stats_offset_arr[BNX2_NUM_STATS] = { +static unsigned long bnx2_stats_offset_arr[BNX2_NUM_STATS] = { STATS_OFFSET32(stat_IfHCInOctets_hi), STATS_OFFSET32(stat_IfHCInBadOctets_hi), STATS_OFFSET32(stat_IfHCOutOctets_hi), @@ -4801,7 +4800,7 @@ unsigned long bnx2_stats_offset_arr[BNX2_NUM_STATS] = { /* stat_IfHCInBadOctets and stat_Dot3StatsCarrierSenseErrors are * skipped because of errata. */ -u8 bnx2_5706_stats_len_arr[BNX2_NUM_STATS] = { +static u8 bnx2_5706_stats_len_arr[BNX2_NUM_STATS] = { 8,0,8,8,8,8,8,8,8,8, 4,0,4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,4,4, @@ -4811,7 +4810,7 @@ u8 bnx2_5706_stats_len_arr[BNX2_NUM_STATS] = { #define BNX2_NUM_TESTS 6 -struct { +static struct { char string[ETH_GSTRING_LEN]; } bnx2_tests_str_arr[BNX2_NUM_TESTS] = { { "register_test (offline)" }, @@ -4910,7 +4909,7 @@ bnx2_get_ethtool_stats(struct net_device *dev, struct bnx2 *bp = dev->priv; int i; u32 *hw_stats = (u32 *) bp->stats_blk; - u8 *stats_len_arr = 0; + u8 *stats_len_arr = NULL; if (hw_stats == NULL) { memset(buf, 0, sizeof(u64) * BNX2_NUM_STATS); @@ -5012,7 +5011,7 @@ static struct ethtool_ops bnx2_ethtool_ops = { static int bnx2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - struct mii_ioctl_data *data = (struct mii_ioctl_data *)&ifr->ifr_data; + struct mii_ioctl_data *data = if_mii(ifr); struct bnx2 *bp = dev->priv; int err; @@ -5505,12 +5504,12 @@ bnx2_resume(struct pci_dev *pdev) } static struct pci_driver bnx2_pci_driver = { - name: DRV_MODULE_NAME, - id_table: bnx2_pci_tbl, - probe: bnx2_init_one, - remove: __devexit_p(bnx2_remove_one), - suspend: bnx2_suspend, - resume: bnx2_resume, + .name = DRV_MODULE_NAME, + .id_table = bnx2_pci_tbl, + .probe = bnx2_init_one, + .remove = __devexit_p(bnx2_remove_one), + .suspend = bnx2_suspend, + .resume = bnx2_resume, }; static int __init bnx2_init(void) -- cgit v0.10.2 From e41aac41e3856c87fee52c5b8bca71705d15449b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 11 Aug 2005 14:37:16 -0700 Subject: [TCPDIAG]: Introduce CONFIG_IP_TCPDIAG_DCCP Similar to CONFIG_IP_TCPDIAG_IPV6 Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 0b3d9f1..c844954 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -419,15 +419,18 @@ config IP_TCPDIAG ---help--- Support for TCP socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable - at . If you want IPv6 support - and have selected IPv6 as a module, you need to build this as a - module too. + at . If you want IPv6 or DCCP + support and have selected IPv6 or DCCP as a module, you need to build + this as a module too. If unsure, say Y. config IP_TCPDIAG_IPV6 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) +config IP_TCPDIAG_DCCP + def_bool (IP_TCPDIAG=y && IP_DCCP=y) || (IP_TCPDIAG=m && IP_DCCP) + config TCP_CONG_ADVANCED bool "TCP: advanced congestion control" ---help--- diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index f5fc84a..8bf495c 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -45,7 +45,7 @@ static struct sock *tcpnl; #define TCPDIAG_PUT(skb, attrtype, attrlen) \ RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +#ifdef CONFIG_IP_TCPDIAG_DCCP extern struct inet_hashinfo dccp_hashinfo; #endif @@ -216,7 +216,7 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) struct tcpdiagreq *req = NLMSG_DATA(nlh); struct sk_buff *rep; struct inet_hashinfo *hashinfo = &tcp_hashinfo; -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +#ifdef CONFIG_IP_TCPDIAG_DCCP if (nlh->nlmsg_type == DCCPDIAG_GETSOCK) hashinfo = &dccp_hashinfo; #endif @@ -614,7 +614,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) s_i = cb->args[1]; s_num = num = cb->args[2]; hashinfo = &tcp_hashinfo; -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +#ifdef CONFIG_IP_TCPDIAG_DCCP if (cb->nlh->nlmsg_type == DCCPDIAG_GETSOCK) hashinfo = &dccp_hashinfo; #endif @@ -752,7 +752,7 @@ tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return 0; if (nlh->nlmsg_type != TCPDIAG_GETSOCK -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +#ifdef CONFIG_IP_TCPDIAG_DCCP && nlh->nlmsg_type != DCCPDIAG_GETSOCK #endif ) -- cgit v0.10.2 From 622439270c74f3ad4f69d1417aca4bb3b79514f4 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Thu, 11 Aug 2005 15:30:45 -0700 Subject: [NETFILTER]: Fix compilation when no PROC_FS enabled Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 573e76a..3e76bd0 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -167,12 +167,12 @@ int __init netfilter_log_init(void) { #ifdef CONFIG_PROC_FS struct proc_dir_entry *pde; + pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter); -#endif if (!pde) return -1; pde->proc_fops = &nflog_file_ops; - +#endif return 0; } -- cgit v0.10.2 From 5917ed961def82a4dba9198d11a75f79d115a8cb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Thu, 11 Aug 2005 15:31:15 -0700 Subject: [NETFILTER]: Fix NF_QUEUE_NR() macro I obviously wanted to use bitwise-or, not logical or. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index ac3c614..189ba67 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -29,7 +29,7 @@ #define NF_VERDICT_QMASK 0xffff0000 #define NF_VERDICT_QBITS 16 -#define NF_QUEUE_NR(x) ((x << NF_VERDICT_QBITS) & NF_VERDICT_QMASK || NF_QUEUE) +#define NF_QUEUE_NR(x) ((x << NF_VERDICT_QBITS) & NF_VERDICT_QMASK | NF_QUEUE) /* only for userspace compatibility */ #ifndef __KERNEL__ -- cgit v0.10.2 From 0a242efc4fb859b2da506cdf8f3366231602e4ff Mon Sep 17 00:00:00 2001 From: Denis Vlasenko Date: Thu, 11 Aug 2005 15:32:53 -0700 Subject: [NET]: Deinline netif_carrier_{on,off}(). # grep -r 'netif_carrier_o[nf]' linux-2.6.12 | wc -l 246 # size vmlinux.org vmlinux.carrier text data bss dec hex filename 4339634 1054414 259296 5653344 564360 vmlinux.org 4337710 1054414 259296 5651420 563bdc vmlinux.carrier And this ain't an allyesconfig kernel! Signed-off-by: David S. Miller diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 296cf93..d8e52ed 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -699,19 +699,9 @@ static inline int netif_carrier_ok(const struct net_device *dev) extern void __netdev_watchdog_up(struct net_device *dev); -static inline void netif_carrier_on(struct net_device *dev) -{ - if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) - linkwatch_fire_event(dev); - if (netif_running(dev)) - __netdev_watchdog_up(dev); -} +extern void netif_carrier_on(struct net_device *dev); -static inline void netif_carrier_off(struct net_device *dev) -{ - if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) - linkwatch_fire_event(dev); -} +extern void netif_carrier_off(struct net_device *dev); /* Hot-plugging. */ static inline int netif_device_present(struct net_device *dev) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0d066c9..99ceb91 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -238,6 +238,20 @@ static void dev_watchdog_down(struct net_device *dev) spin_unlock_bh(&dev->xmit_lock); } +void netif_carrier_on(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) + linkwatch_fire_event(dev); + if (netif_running(dev)) + __netdev_watchdog_up(dev); +} + +void netif_carrier_off(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) + linkwatch_fire_event(dev); +} + /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. @@ -600,6 +614,8 @@ void dev_shutdown(struct net_device *dev) } EXPORT_SYMBOL(__netdev_watchdog_up); +EXPORT_SYMBOL(netif_carrier_on); +EXPORT_SYMBOL(netif_carrier_off); EXPORT_SYMBOL(noop_qdisc); EXPORT_SYMBOL(noop_qdisc_ops); EXPORT_SYMBOL(qdisc_create_dflt); -- cgit v0.10.2 From b766b305d3f2d8be173e5d9853534ea1afdbabba Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Fri, 12 Aug 2005 11:36:44 -0700 Subject: [NETFILTER]: Fix gcc-3.4.x warning about iplicit operator precedence Fix gcc-3.4.x warning about iplicit operator precedence in NF_QUEUE_NR() Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 189ba67..be365e7 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -29,7 +29,7 @@ #define NF_VERDICT_QMASK 0xffff0000 #define NF_VERDICT_QBITS 16 -#define NF_QUEUE_NR(x) ((x << NF_VERDICT_QBITS) & NF_VERDICT_QMASK | NF_QUEUE) +#define NF_QUEUE_NR(x) (((x << NF_VERDICT_QBITS) & NF_VERDICT_QMASK) | NF_QUEUE) /* only for userspace compatibility */ #ifndef __KERNEL__ -- cgit v0.10.2 From 505cbfc577f3fa778005e2800b869eca25727d5f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 09:19:38 -0300 Subject: [IPV6]: Generalise the tcp_v6_lookup routines In the same way as was done with the v4 counterparts, this will be moved to inet6_hashtables.c. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 777339b..3c7dbc6 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -193,6 +193,11 @@ struct inet6_skb_parm { #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) +static inline int inet6_iif(const struct sk_buff *skb) +{ + return IP6CB(skb)->iif; +} + struct tcp6_request_sock { struct tcp_request_sock req; struct in6_addr loc_addr; diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h new file mode 100644 index 0000000..297c2b1 --- /dev/null +++ b/include/net/inet6_hashtables.h @@ -0,0 +1,26 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _INET6_HASHTABLES_H +#define _INET6_HASHTABLES_H + +#include + +struct in6_addr; +struct inet_hashinfo; + +extern struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, const u16 sport, + const struct in6_addr *daddr, const u16 dport, + const int dif); +#endif /* _INET6_HASHTABLES_H */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index c844954..a79b4f9 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -425,9 +425,6 @@ config IP_TCPDIAG If unsure, say Y. -config IP_TCPDIAG_IPV6 - def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) - config IP_TCPDIAG_DCCP def_bool (IP_TCPDIAG=y && IP_DCCP=y) || (IP_TCPDIAG=m && IP_DCCP) diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 8bf495c..b812191 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -24,6 +24,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -102,7 +106,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->tcpdiag_wqueue = 0; r->tcpdiag_uid = 0; r->tcpdiag_inode = 0; -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->tcpdiag_family == AF_INET6) { const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); @@ -121,7 +125,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->id.tcpdiag_src[0] = inet->rcv_saddr; r->id.tcpdiag_dst[0] = inet->daddr; -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->tcpdiag_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -196,19 +200,6 @@ nlmsg_failure: return -1; } -#ifdef CONFIG_IP_TCPDIAG_IPV6 -extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport, - int dif); -#else -static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport, - int dif) -{ - return NULL; -} -#endif - static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) { int err; @@ -225,11 +216,14 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) req->id.tcpdiag_dport, req->id.tcpdiag_src[0], req->id.tcpdiag_sport, req->id.tcpdiag_if); } -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) else if (req->tcpdiag_family == AF_INET6) { - sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport, - (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport, - req->id.tcpdiag_if); + sk = inet6_lookup(hashinfo, + (struct in6_addr*)req->id.tcpdiag_dst, + req->id.tcpdiag_dport, + (struct in6_addr*)req->id.tcpdiag_src, + req->id.tcpdiag_sport, + req->id.tcpdiag_if); } #endif else { @@ -440,7 +434,7 @@ static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, struct inet_sock *inet = inet_sk(sk); entry.family = sk->sk_family; -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (entry.family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -502,7 +496,7 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, r->tcpdiag_wqueue = 0; r->tcpdiag_uid = sock_i_uid(sk); r->tcpdiag_inode = 0; -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->tcpdiag_family == AF_INET6) { ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, &tcp6_rsk(req)->loc_addr); @@ -567,13 +561,13 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (bc) { entry.saddr = -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? tcp6_rsk(req)->loc_addr.s6_addr32 : #endif &ireq->loc_addr; entry.daddr = -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? tcp6_rsk(req)->rmt_addr.s6_addr32 : #endif diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3312cb8..2bc7faf 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -76,26 +76,27 @@ static struct tcp_func ipv6_mapped; static struct tcp_func ipv6_specific; /* I have no idea if this is a good hash for v6 or not. -DaveM */ -static __inline__ int tcp_v6_hashfn(struct in6_addr *laddr, u16 lport, - struct in6_addr *faddr, u16 fport) +static inline int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const u16 fport, + const int ehash_size) { int hashent = (lport ^ fport); hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); hashent ^= hashent>>16; hashent ^= hashent>>8; - return (hashent & (tcp_hashinfo.ehash_size - 1)); + return (hashent & (ehash_size - 1)); } -static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) +static inline int inet6_sk_ehashfn(const struct sock *sk, const int ehash_size) { - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *laddr = &np->rcv_saddr; - struct in6_addr *faddr = &np->daddr; - __u16 lport = inet->num; - __u16 fport = inet->dport; - return tcp_v6_hashfn(laddr, lport, faddr, fport); + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *laddr = &np->rcv_saddr; + const struct in6_addr *faddr = &np->daddr; + const __u16 lport = inet->num; + const __u16 fport = inet->dport; + return inet6_ehashfn(laddr, lport, faddr, fport, ehash_size); } static inline int tcp_v6_bind_conflict(const struct sock *sk, @@ -231,7 +232,7 @@ static __inline__ void __tcp_v6_hash(struct sock *sk) lock = &tcp_hashinfo.lhash_lock; inet_listen_wlock(&tcp_hashinfo); } else { - sk->sk_hashent = tcp_v6_sk_hashfn(sk); + sk->sk_hashent = inet6_sk_ehashfn(sk, tcp_hashinfo.ehash_size); list = &tcp_hashinfo.ehash[sk->sk_hashent].chain; lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock; write_lock(lock); @@ -258,7 +259,10 @@ static void tcp_v6_hash(struct sock *sk) } } -static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum, int dif) +static struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, + const struct in6_addr *daddr, + const unsigned short hnum, + const int dif) { struct sock *sk; struct hlist_node *node; @@ -266,8 +270,8 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor int score, hiscore; hiscore=0; - read_lock(&tcp_hashinfo.lhash_lock); - sk_for_each(sk, node, &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)]) { + read_lock(&hashinfo->lhash_lock); + sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -294,7 +298,7 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor } if (result) sock_hold(result); - read_unlock(&tcp_hashinfo.lhash_lock); + read_unlock(&hashinfo->lhash_lock); return result; } @@ -304,9 +308,13 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor * The sockhash lock must be held as a reader here. */ -static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 hnum, - int dif) +static inline struct sock * + __inet6_lookup_established(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, + const u16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif) { struct sock *sk; const struct hlist_node *node; @@ -314,8 +322,9 @@ static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - const int hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); - struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; + const int hash = inet6_ehashfn(daddr, hnum, saddr, sport, + hashinfo->ehash_size); + struct inet_ehash_bucket *head = &hashinfo->ehash[hash]; read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { @@ -324,7 +333,7 @@ static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { + sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { const struct inet_timewait_sock *tw = inet_twsk(sk); if(*((__u32 *)&(tw->tw_dport)) == ports && @@ -347,34 +356,36 @@ hit: } -static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 hnum, - int dif) +static inline struct sock *__inet6_lookup(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, + const u16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif) { - struct sock *sk; - - sk = __tcp_v6_lookup_established(saddr, sport, daddr, hnum, dif); - + struct sock *sk = __inet6_lookup_established(hashinfo, saddr, sport, + daddr, hnum, dif); if (sk) return sk; - return tcp_v6_lookup_listener(daddr, hnum, dif); + return inet6_lookup_listener(hashinfo, daddr, hnum, dif); } -inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport, - int dif) +inline struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, const u16 sport, + const struct in6_addr *daddr, const u16 dport, + const int dif) { struct sock *sk; local_bh_disable(); - sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif); + sk = __inet6_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif); local_bh_enable(); return sk; } -EXPORT_SYMBOL_GPL(tcp_v6_lookup); +EXPORT_SYMBOL_GPL(inet6_lookup); /* @@ -454,16 +465,17 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) } } -static int __tcp_v6_check_established(struct sock *sk, __u16 lport, +static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, struct inet_timewait_sock **twp) { struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *daddr = &np->rcv_saddr; - struct in6_addr *saddr = &np->daddr; - int dif = sk->sk_bound_dev_if; + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *daddr = &np->rcv_saddr; + const struct in6_addr *saddr = &np->daddr; + const int dif = sk->sk_bound_dev_if; const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - const int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); + const int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport, + tcp_hashinfo.ehash_size); struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; const struct hlist_node *node; @@ -637,11 +649,6 @@ out: } } -static __inline__ int tcp_v6_iif(struct sk_buff *skb) -{ - return IP6CB(skb)->iif; -} - static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -833,14 +840,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __u32 info) { struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; - struct tcphdr *th = (struct tcphdr *)(skb->data+offset); + const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct ipv6_pinfo *np; struct sock *sk; int err; struct tcp_sock *tp; __u32 seq; - sk = tcp_v6_lookup(&hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex); + sk = inet6_lookup(&tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr, + th->source, skb->dev->ifindex); if (sk == NULL) { ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); @@ -927,7 +935,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, - &hdr->saddr, tcp_v6_iif(skb)); + &hdr->saddr, inet6_iif(skb)); if (!req) goto out; @@ -1138,7 +1146,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb) buff->csum); fl.proto = IPPROTO_TCP; - fl.oif = tcp_v6_iif(skb); + fl.oif = inet6_iif(skb); fl.fl_ip_dport = t1->dest; fl.fl_ip_sport = t1->source; @@ -1207,7 +1215,7 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 buff->csum); fl.proto = IPPROTO_TCP; - fl.oif = tcp_v6_iif(skb); + fl.oif = inet6_iif(skb); fl.fl_ip_dport = t1->dest; fl.fl_ip_sport = t1->source; @@ -1245,20 +1253,18 @@ static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) { struct request_sock *req, **prev; - struct tcphdr *th = skb->h.th; + const struct tcphdr *th = skb->h.th; struct sock *nsk; /* Find possible connection requests. */ req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, tcp_v6_iif(skb)); + &skb->nh.ipv6h->daddr, inet6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev); - nsk = __tcp_v6_lookup_established(&skb->nh.ipv6h->saddr, - th->source, - &skb->nh.ipv6h->daddr, - ntohs(th->dest), - tcp_v6_iif(skb)); + nsk = __inet6_lookup_established(&tcp_hashinfo, &skb->nh.ipv6h->saddr, + th->source, &skb->nh.ipv6h->daddr, + ntohs(th->dest), inet6_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { @@ -1346,7 +1352,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) - treq->iif = tcp_v6_iif(skb); + treq->iif = inet6_iif(skb); if (isn == 0) isn = tcp_v6_init_sequence(sk,skb); @@ -1411,7 +1417,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_backlog_rcv = tcp_v4_do_rcv; newnp->pktoptions = NULL; newnp->opt = NULL; - newnp->mcast_oif = tcp_v6_iif(skb); + newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = skb->nh.ipv6h->hop_limit; /* @@ -1516,7 +1522,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, skb_set_owner_r(newnp->pktoptions, newsk); } newnp->opt = NULL; - newnp->mcast_oif = tcp_v6_iif(skb); + newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = skb->nh.ipv6h->hop_limit; /* Clone native IPv6 options from listening socket (if any) @@ -1691,7 +1697,7 @@ ipv6_pktoptions: if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo) - np->mcast_oif = tcp_v6_iif(opt_skb); + np->mcast_oif = inet6_iif(opt_skb); if (np->rxopt.bits.rxhlim) np->mcast_hops = opt_skb->nh.ipv6h->hop_limit; if (ipv6_opt_accepted(sk, opt_skb)) { @@ -1746,8 +1752,9 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h); TCP_SKB_CB(skb)->sacked = 0; - sk = __tcp_v6_lookup(&skb->nh.ipv6h->saddr, th->source, - &skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); + sk = __inet6_lookup(&tcp_hashinfo, &skb->nh.ipv6h->saddr, th->source, + &skb->nh.ipv6h->daddr, ntohs(th->dest), + inet6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1818,7 +1825,9 @@ do_time_wait: { struct sock *sk2; - sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); + sk2 = inet6_lookup_listener(&tcp_hashinfo, + &skb->nh.ipv6h->daddr, + ntohs(th->dest), inet6_iif(skb)); if (sk2 != NULL) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule(tw, &tcp_death_row); -- cgit v0.10.2 From 5324a040ccc708998e61ea93e669b81312f0ae11 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 09:26:18 -0300 Subject: [INET6_HASHTABLES]: Move inet6_lookup functions to net/ipv6/inet6_hashtables.c Doing this we allow tcp_diag to support IPV6 even if tcp_diag is compiled statically and IPV6 is compiled as a module, removing the previous restriction while not building any IPV6 code if it is not selected. Now to work on the tcpdiag_register infrastructure and then to rename the whole thing to inetdiag, reflecting its by then completely generic nature. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 297c2b1..03df3b1 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -14,13 +14,117 @@ #ifndef _INET6_HASHTABLES_H #define _INET6_HASHTABLES_H +#include + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +#include +#include #include -struct in6_addr; +#include + struct inet_hashinfo; +/* I have no idea if this is a good hash for v6 or not. -DaveM */ +static inline int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const u16 fport, + const int ehash_size) +{ + int hashent = (lport ^ fport); + + hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); + hashent ^= hashent >> 16; + hashent ^= hashent >> 8; + return (hashent & (ehash_size - 1)); +} + +static inline int inet6_sk_ehashfn(const struct sock *sk, const int ehash_size) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *laddr = &np->rcv_saddr; + const struct in6_addr *faddr = &np->daddr; + const __u16 lport = inet->num; + const __u16 fport = inet->dport; + return inet6_ehashfn(laddr, lport, faddr, fport, ehash_size); +} + +/* + * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so + * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * + * The sockhash lock must be held as a reader here. + */ +static inline struct sock * + __inet6_lookup_established(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, + const u16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif) +{ + struct sock *sk; + const struct hlist_node *node; + const __u32 ports = INET_COMBINED_PORTS(sport, hnum); + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + const int hash = inet6_ehashfn(daddr, hnum, saddr, sport, + hashinfo->ehash_size); + struct inet_ehash_bucket *head = &hashinfo->ehash[hash]; + + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { + /* For IPV6 do the cheaper port and family tests first. */ + if (INET6_MATCH(sk, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { + const struct inet_timewait_sock *tw = inet_twsk(sk); + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk->sk_family == PF_INET6) { + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + + if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && + (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) + goto hit; + } + } + read_unlock(&head->lock); + return NULL; + +hit: + sock_hold(sk); + read_unlock(&head->lock); + return sk; +} + +extern struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, + const struct in6_addr *daddr, + const unsigned short hnum, + const int dif); + +static inline struct sock *__inet6_lookup(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, + const u16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif) +{ + struct sock *sk = __inet6_lookup_established(hashinfo, saddr, sport, + daddr, hnum, dif); + if (sk) + return sk; + + return inet6_lookup_listener(hashinfo, daddr, hnum, dif); +} + extern struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const u16 sport, const struct in6_addr *daddr, const u16 dport, const int dif); +#endif /* defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) */ #endif /* _INET6_HASHTABLES_H */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index a79b4f9..960c02f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -419,9 +419,7 @@ config IP_TCPDIAG ---help--- Support for TCP socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable - at . If you want IPv6 or DCCP - support and have selected IPv6 or DCCP as a module, you need to build - this as a module too. + at . If unsure, say Y. diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 5bccea2..6460eec 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -23,3 +23,5 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-y += exthdrs_core.o + +obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c new file mode 100644 index 0000000..01d5f46 --- /dev/null +++ b/net/ipv6/inet6_hashtables.c @@ -0,0 +1,81 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic INET6 transport hashtables + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +#include + +#include +#include +#include + +struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, + const struct in6_addr *daddr, + const unsigned short hnum, const int dif) +{ + struct sock *sk; + const struct hlist_node *node; + struct sock *result = NULL; + int score, hiscore = 0; + + read_lock(&hashinfo->lhash_lock); + sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { + if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + + score = 1; + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + continue; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score++; + } + if (score == 3) { + result = sk; + break; + } + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + if (result) + sock_hold(result); + read_unlock(&hashinfo->lhash_lock); + return result; +} + +EXPORT_SYMBOL_GPL(inet6_lookup_listener); + +struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, const u16 sport, + const struct in6_addr *daddr, const u16 dport, + const int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __inet6_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} + +EXPORT_SYMBOL_GPL(inet6_lookup); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2bc7faf..fb291b8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -47,6 +47,7 @@ #include #include +#include #include #include #include @@ -75,30 +76,6 @@ static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); static struct tcp_func ipv6_mapped; static struct tcp_func ipv6_specific; -/* I have no idea if this is a good hash for v6 or not. -DaveM */ -static inline int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport, - const struct in6_addr *faddr, const u16 fport, - const int ehash_size) -{ - int hashent = (lport ^ fport); - - hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); - hashent ^= hashent>>16; - hashent ^= hashent>>8; - return (hashent & (ehash_size - 1)); -} - -static inline int inet6_sk_ehashfn(const struct sock *sk, const int ehash_size) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - const struct in6_addr *laddr = &np->rcv_saddr; - const struct in6_addr *faddr = &np->daddr; - const __u16 lport = inet->num; - const __u16 fport = inet->dport; - return inet6_ehashfn(laddr, lport, faddr, fport, ehash_size); -} - static inline int tcp_v6_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) { @@ -259,135 +236,6 @@ static void tcp_v6_hash(struct sock *sk) } } -static struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, - const struct in6_addr *daddr, - const unsigned short hnum, - const int dif) -{ - struct sock *sk; - struct hlist_node *node; - struct sock *result = NULL; - int score, hiscore; - - hiscore=0; - read_lock(&hashinfo->lhash_lock); - sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { - if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - - score = 1; - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) - continue; - score++; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score++; - } - if (score == 3) { - result = sk; - break; - } - if (score > hiscore) { - hiscore = score; - result = sk; - } - } - } - if (result) - sock_hold(result); - read_unlock(&hashinfo->lhash_lock); - return result; -} - -/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so - * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM - * - * The sockhash lock must be held as a reader here. - */ - -static inline struct sock * - __inet6_lookup_established(struct inet_hashinfo *hashinfo, - const struct in6_addr *saddr, - const u16 sport, - const struct in6_addr *daddr, - const u16 hnum, - const int dif) -{ - struct sock *sk; - const struct hlist_node *node; - const __u32 ports = INET_COMBINED_PORTS(sport, hnum); - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - const int hash = inet6_ehashfn(daddr, hnum, saddr, sport, - hashinfo->ehash_size); - struct inet_ehash_bucket *head = &hashinfo->ehash[hash]; - - read_lock(&head->lock); - sk_for_each(sk, node, &head->chain) { - /* For IPV6 do the cheaper port and family tests first. */ - if (INET6_MATCH(sk, saddr, daddr, ports, dif)) - goto hit; /* You sunk my battleship! */ - } - /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { - const struct inet_timewait_sock *tw = inet_twsk(sk); - - if(*((__u32 *)&(tw->tw_dport)) == ports && - sk->sk_family == PF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); - - if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && - (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) - goto hit; - } - } - read_unlock(&head->lock); - return NULL; - -hit: - sock_hold(sk); - read_unlock(&head->lock); - return sk; -} - - -static inline struct sock *__inet6_lookup(struct inet_hashinfo *hashinfo, - const struct in6_addr *saddr, - const u16 sport, - const struct in6_addr *daddr, - const u16 hnum, - const int dif) -{ - struct sock *sk = __inet6_lookup_established(hashinfo, saddr, sport, - daddr, hnum, dif); - if (sk) - return sk; - - return inet6_lookup_listener(hashinfo, daddr, hnum, dif); -} - -inline struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, - const struct in6_addr *saddr, const u16 sport, - const struct in6_addr *daddr, const u16 dport, - const int dif) -{ - struct sock *sk; - - local_bh_disable(); - sk = __inet6_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif); - local_bh_enable(); - - return sk; -} - -EXPORT_SYMBOL_GPL(inet6_lookup); - - /* * Open request hash tables. */ -- cgit v0.10.2 From 4f5736c4c7cf6f9bd8db82b712cfdd51c87e06b9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 09:27:49 -0300 Subject: [TCPDIAG]: Introduce inet_diag_{register,unregister} Next changeset will rename tcp_diag to inet_diag and move the tcp_diag code out of it and into a new tcp_diag.c, similar to the net/dccp/diag.c introduced in this changeset, completing the transition to a generic inet_diag infrastructure. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h index 190494e..910c34b 100644 --- a/include/linux/tcp_diag.h +++ b/include/linux/tcp_diag.h @@ -5,6 +5,8 @@ #define TCPDIAG_GETSOCK 18 #define DCCPDIAG_GETSOCK 19 +#define INET_DIAG_GETSOCK_MAX 24 + /* Socket identity */ struct tcpdiag_sockid { @@ -125,4 +127,21 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; +#ifdef __KERNEL__ +struct sock; +struct inet_hashinfo; + +struct inet_diag_handler { + struct inet_hashinfo *idiag_hashinfo; + void (*idiag_get_info)(struct sock *sk, + struct tcpdiagmsg *r, + void *info); + __u16 idiag_info_size; + __u16 idiag_type; +}; + +extern int inet_diag_register(const struct inet_diag_handler *handler); +extern void inet_diag_unregister(const struct inet_diag_handler *handler); +#endif /* __KERNEL__ */ + #endif /* _TCP_DIAG_H_ */ diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 90460bc..ff5b545 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -19,6 +19,11 @@ config IP_DCCP If in doubt, say N. +config IP_DCCP_DIAG + depends on IP_DCCP && IP_TCPDIAG + def_tristate y if (IP_DCCP = y && IP_TCPDIAG = y) + def_tristate m + source "net/dccp/ccids/Kconfig" endmenu diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 25a50bd..5741fff 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -3,4 +3,8 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ timer.o packet_history.o +obj-$(CONFIG_IP_DCCP_DIAG) += dccp_diag.o + obj-y += ccids/ + +dccp_diag-y := diag.o diff --git a/net/dccp/diag.c b/net/dccp/diag.c new file mode 100644 index 0000000..4d9037c --- /dev/null +++ b/net/dccp/diag.c @@ -0,0 +1,47 @@ +/* + * net/dccp/diag.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +#include +#include + +#include "dccp.h" + +static void dccp_diag_get_info(struct sock *sk, struct tcpdiagmsg *r, + void *_info) +{ + r->tcpdiag_rqueue = r->tcpdiag_wqueue = 0; +} + +static struct inet_diag_handler dccp_diag_handler = { + .idiag_hashinfo = &dccp_hashinfo, + .idiag_get_info = dccp_diag_get_info, + .idiag_type = DCCPDIAG_GETSOCK, + .idiag_info_size = 0, +}; + +static int __init dccp_diag_init(void) +{ + return inet_diag_register(&dccp_diag_handler); +} + +static void __exit dccp_diag_fini(void) +{ + inet_diag_unregister(&dccp_diag_handler); +} + +module_init(dccp_diag_init); +module_exit(dccp_diag_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCP inet_diag handler"); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 960c02f..1e6db2a 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -423,9 +423,6 @@ config IP_TCPDIAG If unsure, say Y. -config IP_TCPDIAG_DCCP - def_bool (IP_TCPDIAG=y && IP_DCCP=y) || (IP_TCPDIAG=m && IP_DCCP) - config TCP_CONG_ADVANCED bool "TCP: advanced congestion control" ---help--- diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index b812191..b13b71c 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -34,6 +34,8 @@ #include +static const struct inet_diag_handler **inet_diag_table; + struct tcpdiag_entry { u32 *saddr; @@ -61,18 +63,24 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); struct tcpdiagmsg *r; struct nlmsghdr *nlh; - struct tcp_info *info = NULL; + void *info = NULL; struct tcpdiag_meminfo *minfo = NULL; unsigned char *b = skb->tail; + const struct inet_diag_handler *handler; + + handler = inet_diag_table[unlh->nlmsg_type]; + BUG_ON(handler == NULL); nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); nlh->nlmsg_flags = nlmsg_flags; + r = NLMSG_DATA(nlh); if (sk->sk_state != TCP_TIME_WAIT) { if (ext & (1<<(TCPDIAG_MEMINFO-1))) minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo)); if (ext & (1<<(TCPDIAG_INFO-1))) - info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); + info = TCPDIAG_PUT(skb, TCPDIAG_INFO, + handler->idiag_info_size); if ((ext & (1 << (TCPDIAG_CONG - 1))) && icsk->icsk_ca_ops) { size_t len = strlen(icsk->icsk_ca_ops->name); @@ -155,19 +163,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->tcpdiag_expires = 0; } #undef EXPIRES_IN_MS - /* - * Ahem... for now we'll have some knowledge about TCP -acme - * But this is just one of two small exceptions, both in this - * function, so lets close our eyes for some 15 lines or so... 8) - * -acme - */ - if (sk->sk_protocol == IPPROTO_TCP) { - const struct tcp_sock *tp = tcp_sk(sk); - - r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; - } else - r->tcpdiag_rqueue = r->tcpdiag_wqueue = 0; r->tcpdiag_uid = sock_i_uid(sk); r->tcpdiag_inode = sock_i_ino(sk); @@ -179,13 +174,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); } - /* Ahem... for now we'll have some knowledge about TCP -acme */ - if (info) { - if (sk->sk_protocol == IPPROTO_TCP) - tcp_get_info(sk, info); - else - memset(info, 0, sizeof(*info)); - } + handler->idiag_get_info(sk, r, info); if (sk->sk_state < TCP_TIME_WAIT && icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) @@ -206,11 +195,13 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) struct sock *sk; struct tcpdiagreq *req = NLMSG_DATA(nlh); struct sk_buff *rep; - struct inet_hashinfo *hashinfo = &tcp_hashinfo; -#ifdef CONFIG_IP_TCPDIAG_DCCP - if (nlh->nlmsg_type == DCCPDIAG_GETSOCK) - hashinfo = &dccp_hashinfo; -#endif + struct inet_hashinfo *hashinfo; + const struct inet_diag_handler *handler; + + handler = inet_diag_table[nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; + if (req->tcpdiag_family == AF_INET) { sk = inet_lookup(hashinfo, req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, req->id.tcpdiag_src[0], @@ -241,9 +232,10 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) goto out; err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+ - sizeof(struct tcpdiag_meminfo)+ - sizeof(struct tcp_info)+64), GFP_KERNEL); + rep = alloc_skb(NLMSG_SPACE((sizeof(struct tcpdiagmsg) + + sizeof(struct tcpdiag_meminfo) + + handler->idiag_info_size + 64)), + GFP_KERNEL); if (!rep) goto out; @@ -603,15 +595,16 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) int i, num; int s_i, s_num; struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + const struct inet_diag_handler *handler; struct inet_hashinfo *hashinfo; + handler = inet_diag_table[cb->nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; + s_i = cb->args[1]; s_num = num = cb->args[2]; - hashinfo = &tcp_hashinfo; -#ifdef CONFIG_IP_TCPDIAG_DCCP - if (cb->nlh->nlmsg_type == DCCPDIAG_GETSOCK) - hashinfo = &dccp_hashinfo; -#endif + if (cb->args[0] == 0) { if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) goto skip_listen_ht; @@ -745,13 +738,12 @@ tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) return 0; - if (nlh->nlmsg_type != TCPDIAG_GETSOCK -#ifdef CONFIG_IP_TCPDIAG_DCCP - && nlh->nlmsg_type != DCCPDIAG_GETSOCK -#endif - ) + if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) goto err_inval; + if (inet_diag_table[nlh->nlmsg_type] == NULL) + return -ENOENT; + if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) goto err_inval; @@ -803,18 +795,95 @@ static void tcpdiag_rcv(struct sock *sk, int len) } } +static void tcp_diag_get_info(struct sock *sk, struct tcpdiagmsg *r, + void *_info) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_info *info = _info; + + r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + if (info != NULL) + tcp_get_info(sk, info); +} + +static struct inet_diag_handler tcp_diag_handler = { + .idiag_hashinfo = &tcp_hashinfo, + .idiag_get_info = tcp_diag_get_info, + .idiag_type = TCPDIAG_GETSOCK, + .idiag_info_size = sizeof(struct tcp_info), +}; + +static DEFINE_SPINLOCK(inet_diag_register_lock); + +int inet_diag_register(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + int err = -EINVAL; + + if (type >= INET_DIAG_GETSOCK_MAX) + goto out; + + spin_lock(&inet_diag_register_lock); + err = -EEXIST; + if (inet_diag_table[type] == NULL) { + inet_diag_table[type] = h; + err = 0; + } + spin_unlock(&inet_diag_register_lock); +out: + return err; +} +EXPORT_SYMBOL_GPL(inet_diag_register); + +void inet_diag_unregister(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + + if (type >= INET_DIAG_GETSOCK_MAX) + return; + + spin_lock(&inet_diag_register_lock); + inet_diag_table[type] = NULL; + spin_unlock(&inet_diag_register_lock); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(inet_diag_unregister); + static int __init tcpdiag_init(void) { + const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * + sizeof(struct inet_diag_handler *)); + int err = -ENOMEM; + + inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL); + if (!inet_diag_table) + goto out; + + memset(inet_diag_table, 0, inet_diag_table_size); + tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv, THIS_MODULE); if (tcpnl == NULL) - return -ENOMEM; - return 0; + goto out_free_table; + + err = inet_diag_register(&tcp_diag_handler); + if (err) + goto out_sock_release; +out: + return err; +out_sock_release: + sock_release(tcpnl->sk_socket); +out_free_table: + kfree(inet_diag_table); + goto out; } static void __exit tcpdiag_exit(void) { sock_release(tcpnl->sk_socket); + kfree(inet_diag_table); } module_init(tcpdiag_init); -- cgit v0.10.2 From 73c1f4a033675f168df7e98bbeeafca3c644b8a6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 12:51:49 -0300 Subject: [TCPDIAG]: Just rename everything to inet_diag Next changeset will rename tcp_diag.[ch] to inet_diag.[ch]. I'm taking this longer route so as to easy review, making clear the changes made all along the way. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 1c50fea..d5e09bc 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -8,7 +8,7 @@ #define NETLINK_W1 1 /* 1-wire subsystem */ #define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ #define NETLINK_FIREWALL 3 /* Firewalling hook */ -#define NETLINK_TCPDIAG 4 /* TCP socket monitoring */ +#define NETLINK_INET_DIAG 4 /* INET socket monitoring */ #define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ #define NETLINK_XFRM 6 /* ipsec */ #define NETLINK_SELINUX 7 /* SELinux event notifications */ diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h index 910c34b..a4606e5 100644 --- a/include/linux/tcp_diag.h +++ b/include/linux/tcp_diag.h @@ -1,5 +1,5 @@ -#ifndef _TCP_DIAG_H_ -#define _TCP_DIAG_H_ 1 +#ifndef _INET_DIAG_H_ +#define _INET_DIAG_H_ 1 /* Just some random number */ #define TCPDIAG_GETSOCK 18 @@ -8,39 +8,36 @@ #define INET_DIAG_GETSOCK_MAX 24 /* Socket identity */ -struct tcpdiag_sockid -{ - __u16 tcpdiag_sport; - __u16 tcpdiag_dport; - __u32 tcpdiag_src[4]; - __u32 tcpdiag_dst[4]; - __u32 tcpdiag_if; - __u32 tcpdiag_cookie[2]; -#define TCPDIAG_NOCOOKIE (~0U) +struct inet_diag_sockid { + __u16 idiag_sport; + __u16 idiag_dport; + __u32 idiag_src[4]; + __u32 idiag_dst[4]; + __u32 idiag_if; + __u32 idiag_cookie[2]; +#define INET_DIAG_NOCOOKIE (~0U) }; /* Request structure */ -struct tcpdiagreq -{ - __u8 tcpdiag_family; /* Family of addresses. */ - __u8 tcpdiag_src_len; - __u8 tcpdiag_dst_len; - __u8 tcpdiag_ext; /* Query extended information */ +struct inet_diag_req { + __u8 idiag_family; /* Family of addresses. */ + __u8 idiag_src_len; + __u8 idiag_dst_len; + __u8 idiag_ext; /* Query extended information */ - struct tcpdiag_sockid id; + struct inet_diag_sockid id; - __u32 tcpdiag_states; /* States to dump */ - __u32 tcpdiag_dbs; /* Tables to dump (NI) */ + __u32 idiag_states; /* States to dump */ + __u32 idiag_dbs; /* Tables to dump (NI) */ }; -enum -{ - TCPDIAG_REQ_NONE, - TCPDIAG_REQ_BYTECODE, +enum { + INET_DIAG_REQ_NONE, + INET_DIAG_REQ_BYTECODE, }; -#define TCPDIAG_REQ_MAX TCPDIAG_REQ_BYTECODE +#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE /* Bytecode is sequence of 4 byte commands followed by variable arguments. * All the commands identified by "code" are conditional jumps forward: @@ -48,28 +45,25 @@ enum * length of the command and its arguments. */ -struct tcpdiag_bc_op -{ +struct inet_diag_bc_op { unsigned char code; unsigned char yes; unsigned short no; }; -enum -{ - TCPDIAG_BC_NOP, - TCPDIAG_BC_JMP, - TCPDIAG_BC_S_GE, - TCPDIAG_BC_S_LE, - TCPDIAG_BC_D_GE, - TCPDIAG_BC_D_LE, - TCPDIAG_BC_AUTO, - TCPDIAG_BC_S_COND, - TCPDIAG_BC_D_COND, +enum { + INET_DIAG_BC_NOP, + INET_DIAG_BC_JMP, + INET_DIAG_BC_S_GE, + INET_DIAG_BC_S_LE, + INET_DIAG_BC_D_GE, + INET_DIAG_BC_D_LE, + INET_DIAG_BC_AUTO, + INET_DIAG_BC_S_COND, + INET_DIAG_BC_D_COND, }; -struct tcpdiag_hostcond -{ +struct inet_diag_hostcond { __u8 family; __u8 prefix_len; int port; @@ -78,47 +72,44 @@ struct tcpdiag_hostcond /* Base info structure. It contains socket identity (addrs/ports/cookie) * and, alas, the information shown by netstat. */ -struct tcpdiagmsg -{ - __u8 tcpdiag_family; - __u8 tcpdiag_state; - __u8 tcpdiag_timer; - __u8 tcpdiag_retrans; - - struct tcpdiag_sockid id; - - __u32 tcpdiag_expires; - __u32 tcpdiag_rqueue; - __u32 tcpdiag_wqueue; - __u32 tcpdiag_uid; - __u32 tcpdiag_inode; +struct inet_diag_msg { + __u8 idiag_family; + __u8 idiag_state; + __u8 idiag_timer; + __u8 idiag_retrans; + + struct inet_diag_sockid id; + + __u32 idiag_expires; + __u32 idiag_rqueue; + __u32 idiag_wqueue; + __u32 idiag_uid; + __u32 idiag_inode; }; /* Extensions */ -enum -{ - TCPDIAG_NONE, - TCPDIAG_MEMINFO, - TCPDIAG_INFO, - TCPDIAG_VEGASINFO, - TCPDIAG_CONG, +enum { + INET_DIAG_NONE, + INET_DIAG_MEMINFO, + INET_DIAG_INFO, + INET_DIAG_VEGASINFO, + INET_DIAG_CONG, }; -#define TCPDIAG_MAX TCPDIAG_CONG +#define INET_DIAG_MAX INET_DIAG_CONG -/* TCPDIAG_MEM */ +/* INET_DIAG_MEM */ -struct tcpdiag_meminfo -{ - __u32 tcpdiag_rmem; - __u32 tcpdiag_wmem; - __u32 tcpdiag_fmem; - __u32 tcpdiag_tmem; +struct inet_diag_meminfo { + __u32 idiag_rmem; + __u32 idiag_wmem; + __u32 idiag_fmem; + __u32 idiag_tmem; }; -/* TCPDIAG_VEGASINFO */ +/* INET_DIAG_VEGASINFO */ struct tcpvegas_info { __u32 tcpv_enabled; @@ -134,7 +125,7 @@ struct inet_hashinfo; struct inet_diag_handler { struct inet_hashinfo *idiag_hashinfo; void (*idiag_get_info)(struct sock *sk, - struct tcpdiagmsg *r, + struct inet_diag_msg *r, void *info); __u16 idiag_info_size; __u16 idiag_type; @@ -144,4 +135,4 @@ extern int inet_diag_register(const struct inet_diag_handler *handler); extern void inet_diag_unregister(const struct inet_diag_handler *handler); #endif /* __KERNEL__ */ -#endif /* _TCP_DIAG_H_ */ +#endif /* _INET_DIAG_H_ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 0b3f729..fef1227 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -690,7 +690,7 @@ struct tcp_congestion_ops { u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, u32 num_acked); - /* get info for tcp_diag (optional) */ + /* get info for inet_diag (optional) */ void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb); char name[TCP_CA_NAME_MAX]; diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index ff5b545..efce4f3 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -20,8 +20,8 @@ config IP_DCCP If in doubt, say N. config IP_DCCP_DIAG - depends on IP_DCCP && IP_TCPDIAG - def_tristate y if (IP_DCCP = y && IP_TCPDIAG = y) + depends on IP_DCCP && IP_INET_DIAG + def_tristate y if (IP_DCCP = y && IP_INET_DIAG = y) def_tristate m source "net/dccp/ccids/Kconfig" diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 4d9037c..9f07eff 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -16,10 +16,10 @@ #include "dccp.h" -static void dccp_diag_get_info(struct sock *sk, struct tcpdiagmsg *r, +static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { - r->tcpdiag_rqueue = r->tcpdiag_wqueue = 0; + r->idiag_rqueue = r->idiag_wqueue = 0; } static struct inet_diag_handler dccp_diag_handler = { diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 1e6db2a..019e88d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -413,13 +413,13 @@ config INET_TUNNEL If unsure, say Y. -config IP_TCPDIAG - tristate "IP: TCP socket monitoring interface" +config IP_INET_DIAG + tristate "IP: INET socket monitoring interface" default y ---help--- - Support for TCP socket monitoring interface used by native Linux - tools such as ss. ss is included in iproute2, currently downloadable - at . + Support for INET (TCP, DCCP, etc) socket monitoring interface used by + native Linux tools such as ss. ss is included in iproute2, currently + downloadable at . If unsure, say Y. diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ea0e1d8..9b1c894 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -30,7 +30,7 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ -obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o +obj-$(CONFIG_IP_INET_DIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index b13b71c..24abe82 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -1,7 +1,7 @@ /* - * tcp_diag.c Module for monitoring TCP sockets. + * inet_diag.c Module for monitoring INET transport protocols sockets. * - * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ + * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ * * Authors: Alexey Kuznetsov, * @@ -36,8 +36,7 @@ static const struct inet_diag_handler **inet_diag_table; -struct tcpdiag_entry -{ +struct inet_diag_entry { u32 *saddr; u32 *daddr; u16 sport; @@ -46,25 +45,21 @@ struct tcpdiag_entry u16 userlocks; }; -static struct sock *tcpnl; +static struct sock *idiagnl; -#define TCPDIAG_PUT(skb, attrtype, attrlen) \ +#define INET_DIAG_PUT(skb, attrtype, attrlen) \ RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) -#ifdef CONFIG_IP_TCPDIAG_DCCP -extern struct inet_hashinfo dccp_hashinfo; -#endif - -static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, +static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcpdiagmsg *r; + struct inet_diag_msg *r; struct nlmsghdr *nlh; void *info = NULL; - struct tcpdiag_meminfo *minfo = NULL; + struct inet_diag_meminfo *minfo = NULL; unsigned char *b = skb->tail; const struct inet_diag_handler *handler; @@ -76,51 +71,52 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r = NLMSG_DATA(nlh); if (sk->sk_state != TCP_TIME_WAIT) { - if (ext & (1<<(TCPDIAG_MEMINFO-1))) - minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo)); - if (ext & (1<<(TCPDIAG_INFO-1))) - info = TCPDIAG_PUT(skb, TCPDIAG_INFO, + if (ext & (1 << (INET_DIAG_MEMINFO - 1))) + minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, + sizeof(*minfo)); + if (ext & (1 << (INET_DIAG_INFO - 1))) + info = INET_DIAG_PUT(skb, INET_DIAG_INFO, handler->idiag_info_size); - if ((ext & (1 << (TCPDIAG_CONG - 1))) && icsk->icsk_ca_ops) { + if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { size_t len = strlen(icsk->icsk_ca_ops->name); - strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), + strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), icsk->icsk_ca_ops->name); } } - r->tcpdiag_family = sk->sk_family; - r->tcpdiag_state = sk->sk_state; - r->tcpdiag_timer = 0; - r->tcpdiag_retrans = 0; + r->idiag_family = sk->sk_family; + r->idiag_state = sk->sk_state; + r->idiag_timer = 0; + r->idiag_retrans = 0; - r->id.tcpdiag_if = sk->sk_bound_dev_if; - r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk; - r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)sk; + r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); - if (r->tcpdiag_state == TCP_TIME_WAIT) { + if (r->idiag_state == TCP_TIME_WAIT) { const struct inet_timewait_sock *tw = inet_twsk(sk); long tmo = tw->tw_ttd - jiffies; if (tmo < 0) tmo = 0; - r->id.tcpdiag_sport = tw->tw_sport; - r->id.tcpdiag_dport = tw->tw_dport; - r->id.tcpdiag_src[0] = tw->tw_rcv_saddr; - r->id.tcpdiag_dst[0] = tw->tw_daddr; - r->tcpdiag_state = tw->tw_substate; - r->tcpdiag_timer = 3; - r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ; - r->tcpdiag_rqueue = 0; - r->tcpdiag_wqueue = 0; - r->tcpdiag_uid = 0; - r->tcpdiag_inode = 0; + r->id.idiag_sport = tw->tw_sport; + r->id.idiag_dport = tw->tw_dport; + r->id.idiag_src[0] = tw->tw_rcv_saddr; + r->id.idiag_dst[0] = tw->tw_daddr; + r->idiag_state = tw->tw_substate; + r->idiag_timer = 3; + r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->tcpdiag_family == AF_INET6) { + if (r->idiag_family == AF_INET6) { const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, &tcp6tw->tw_v6_rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, &tcp6tw->tw_v6_daddr); } #endif @@ -128,18 +124,18 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, return skb->len; } - r->id.tcpdiag_sport = inet->sport; - r->id.tcpdiag_dport = inet->dport; - r->id.tcpdiag_src[0] = inet->rcv_saddr; - r->id.tcpdiag_dst[0] = inet->daddr; + r->id.idiag_sport = inet->sport; + r->id.idiag_dport = inet->dport; + r->id.idiag_src[0] = inet->rcv_saddr; + r->id.idiag_dst[0] = inet->daddr; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->tcpdiag_family == AF_INET6) { + if (r->idiag_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, &np->rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, &np->daddr); } #endif @@ -147,31 +143,31 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, #define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ if (icsk->icsk_pending == ICSK_TIME_RETRANS) { - r->tcpdiag_timer = 1; - r->tcpdiag_retrans = icsk->icsk_retransmits; - r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + r->idiag_timer = 1; + r->idiag_retrans = icsk->icsk_retransmits; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { - r->tcpdiag_timer = 4; - r->tcpdiag_retrans = icsk->icsk_probes_out; - r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + r->idiag_timer = 4; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (timer_pending(&sk->sk_timer)) { - r->tcpdiag_timer = 2; - r->tcpdiag_retrans = icsk->icsk_probes_out; - r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); + r->idiag_timer = 2; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); } else { - r->tcpdiag_timer = 0; - r->tcpdiag_expires = 0; + r->idiag_timer = 0; + r->idiag_expires = 0; } #undef EXPIRES_IN_MS - r->tcpdiag_uid = sock_i_uid(sk); - r->tcpdiag_inode = sock_i_ino(sk); + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = sock_i_ino(sk); if (minfo) { - minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc); - minfo->tcpdiag_wmem = sk->sk_wmem_queued; - minfo->tcpdiag_fmem = sk->sk_forward_alloc; - minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); + minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc); + minfo->idiag_wmem = sk->sk_wmem_queued; + minfo->idiag_fmem = sk->sk_forward_alloc; + minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc); } handler->idiag_get_info(sk, r, info); @@ -189,11 +185,11 @@ nlmsg_failure: return -1; } -static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) +static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) { int err; struct sock *sk; - struct tcpdiagreq *req = NLMSG_DATA(nlh); + struct inet_diag_req *req = NLMSG_DATA(nlh); struct sk_buff *rep; struct inet_hashinfo *hashinfo; const struct inet_diag_handler *handler; @@ -202,19 +198,19 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) BUG_ON(handler == NULL); hashinfo = handler->idiag_hashinfo; - if (req->tcpdiag_family == AF_INET) { - sk = inet_lookup(hashinfo, req->id.tcpdiag_dst[0], - req->id.tcpdiag_dport, req->id.tcpdiag_src[0], - req->id.tcpdiag_sport, req->id.tcpdiag_if); + if (req->idiag_family == AF_INET) { + sk = inet_lookup(hashinfo, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); } #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - else if (req->tcpdiag_family == AF_INET6) { + else if (req->idiag_family == AF_INET6) { sk = inet6_lookup(hashinfo, - (struct in6_addr*)req->id.tcpdiag_dst, - req->id.tcpdiag_dport, - (struct in6_addr*)req->id.tcpdiag_src, - req->id.tcpdiag_sport, - req->id.tcpdiag_if); + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); } #endif else { @@ -225,26 +221,27 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) return -ENOENT; err = -ESTALE; - if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE || - req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) && - ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] || - (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1])) + if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || + req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) && + ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || + (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) goto out; err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE((sizeof(struct tcpdiagmsg) + - sizeof(struct tcpdiag_meminfo) + + rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + handler->idiag_info_size + 64)), GFP_KERNEL); if (!rep) goto out; - if (tcpdiag_fill(rep, sk, req->tcpdiag_ext, + if (inet_diag_fill(rep, sk, req->idiag_ext, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 0, nlh) <= 0) BUG(); - err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, + MSG_DONTWAIT); if (err > 0) err = 0; @@ -285,42 +282,42 @@ static int bitstring_match(const u32 *a1, const u32 *a2, int bits) } -static int tcpdiag_bc_run(const void *bc, int len, - const struct tcpdiag_entry *entry) +static int inet_diag_bc_run(const void *bc, int len, + const struct inet_diag_entry *entry) { while (len > 0) { int yes = 1; - const struct tcpdiag_bc_op *op = bc; + const struct inet_diag_bc_op *op = bc; switch (op->code) { - case TCPDIAG_BC_NOP: + case INET_DIAG_BC_NOP: break; - case TCPDIAG_BC_JMP: + case INET_DIAG_BC_JMP: yes = 0; break; - case TCPDIAG_BC_S_GE: + case INET_DIAG_BC_S_GE: yes = entry->sport >= op[1].no; break; - case TCPDIAG_BC_S_LE: + case INET_DIAG_BC_S_LE: yes = entry->dport <= op[1].no; break; - case TCPDIAG_BC_D_GE: + case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; break; - case TCPDIAG_BC_D_LE: + case INET_DIAG_BC_D_LE: yes = entry->dport <= op[1].no; break; - case TCPDIAG_BC_AUTO: + case INET_DIAG_BC_AUTO: yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); break; - case TCPDIAG_BC_S_COND: - case TCPDIAG_BC_D_COND: + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: { - struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1); + struct inet_diag_hostcond *cond = (struct inet_diag_hostcond*)(op+1); u32 *addr; if (cond->port != -1 && - cond->port != (op->code == TCPDIAG_BC_S_COND ? + cond->port != (op->code == INET_DIAG_BC_S_COND ? entry->sport : entry->dport)) { yes = 0; break; @@ -329,7 +326,7 @@ static int tcpdiag_bc_run(const void *bc, int len, if (cond->prefix_len == 0) break; - if (op->code == TCPDIAG_BC_S_COND) + if (op->code == INET_DIAG_BC_S_COND) addr = entry->saddr; else addr = entry->daddr; @@ -362,7 +359,7 @@ static int tcpdiag_bc_run(const void *bc, int len, static int valid_cc(const void *bc, int len, int cc) { while (len >= 0) { - const struct tcpdiag_bc_op *op = bc; + const struct inet_diag_bc_op *op = bc; if (cc > len) return 0; @@ -376,33 +373,33 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } -static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len) +static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) { const unsigned char *bc = bytecode; int len = bytecode_len; while (len > 0) { - struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc; + struct inet_diag_bc_op *op = (struct inet_diag_bc_op*)bc; //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); switch (op->code) { - case TCPDIAG_BC_AUTO: - case TCPDIAG_BC_S_COND: - case TCPDIAG_BC_D_COND: - case TCPDIAG_BC_S_GE: - case TCPDIAG_BC_S_LE: - case TCPDIAG_BC_D_GE: - case TCPDIAG_BC_D_LE: + case INET_DIAG_BC_AUTO: + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: + case INET_DIAG_BC_S_GE: + case INET_DIAG_BC_S_LE: + case INET_DIAG_BC_D_GE: + case INET_DIAG_BC_D_LE: if (op->yes < 4 || op->yes > len+4) return -EINVAL; - case TCPDIAG_BC_JMP: + case INET_DIAG_BC_JMP: if (op->no < 4 || op->no > len+4) return -EINVAL; if (op->no < len && !valid_cc(bytecode, bytecode_len, len-op->no)) return -EINVAL; break; - case TCPDIAG_BC_NOP: + case INET_DIAG_BC_NOP: if (op->yes < 4 || op->yes > len+4) return -EINVAL; break; @@ -415,13 +412,13 @@ static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len) return len == 0 ? 0 : -EINVAL; } -static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, +static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, struct netlink_callback *cb) { - struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - struct tcpdiag_entry entry; + struct inet_diag_entry entry; struct rtattr *bc = (struct rtattr *)(r + 1); struct inet_sock *inet = inet_sk(sk); @@ -442,15 +439,15 @@ static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, entry.dport = ntohs(inet->dport); entry.userlocks = sk->sk_userlocks; - if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) return 0; } - return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid, + return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } -static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, +static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, struct request_sock *req, u32 pid, u32 seq, const struct nlmsghdr *unlh) @@ -458,7 +455,7 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *inet = inet_sk(sk); unsigned char *b = skb->tail; - struct tcpdiagmsg *r; + struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -466,33 +463,33 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, nlh->nlmsg_flags = NLM_F_MULTI; r = NLMSG_DATA(nlh); - r->tcpdiag_family = sk->sk_family; - r->tcpdiag_state = TCP_SYN_RECV; - r->tcpdiag_timer = 1; - r->tcpdiag_retrans = req->retrans; + r->idiag_family = sk->sk_family; + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = req->retrans; - r->id.tcpdiag_if = sk->sk_bound_dev_if; - r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req; - r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)req; + r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); tmo = req->expires - jiffies; if (tmo < 0) tmo = 0; - r->id.tcpdiag_sport = inet->sport; - r->id.tcpdiag_dport = ireq->rmt_port; - r->id.tcpdiag_src[0] = ireq->loc_addr; - r->id.tcpdiag_dst[0] = ireq->rmt_addr; - r->tcpdiag_expires = jiffies_to_msecs(tmo), - r->tcpdiag_rqueue = 0; - r->tcpdiag_wqueue = 0; - r->tcpdiag_uid = sock_i_uid(sk); - r->tcpdiag_inode = 0; + r->id.idiag_sport = inet->sport; + r->id.idiag_dport = ireq->rmt_port; + r->id.idiag_src[0] = ireq->loc_addr; + r->id.idiag_dst[0] = ireq->rmt_addr; + r->idiag_expires = jiffies_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = 0; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->tcpdiag_family == AF_INET6) { - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + if (r->idiag_family == AF_INET6) { + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, &tcp6_rsk(req)->loc_addr); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, &tcp6_rsk(req)->rmt_addr); } #endif @@ -505,11 +502,11 @@ nlmsg_failure: return -1; } -static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, +static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, struct netlink_callback *cb) { - struct tcpdiag_entry entry; - struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + struct inet_diag_entry entry; + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt; struct rtattr *bc = NULL; @@ -547,8 +544,8 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (reqnum < s_reqnum) continue; - if (r->id.tcpdiag_dport != ireq->rmt_port && - r->id.tcpdiag_dport) + if (r->id.idiag_dport != ireq->rmt_port && + r->id.idiag_dport) continue; if (bc) { @@ -566,12 +563,12 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, &ireq->rmt_addr; entry.dport = ntohs(ireq->rmt_port); - if (!tcpdiag_bc_run(RTA_DATA(bc), + if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) continue; } - err = tcpdiag_fill_req(skb, sk, req, + err = inet_diag_fill_req(skb, sk, req, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, cb->nlh); if (err < 0) { @@ -590,11 +587,11 @@ out: return err; } -static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) +static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { int i, num; int s_i, s_num; - struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); const struct inet_diag_handler *handler; struct inet_hashinfo *hashinfo; @@ -606,7 +603,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) + if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) goto skip_listen_ht; inet_listen_lock(hashinfo); @@ -623,25 +620,25 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; } - if (r->id.tcpdiag_sport != inet->sport && - r->id.tcpdiag_sport) + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) goto next_listen; - if (!(r->tcpdiag_states&TCPF_LISTEN) || - r->id.tcpdiag_dport || + if (!(r->idiag_states & TCPF_LISTEN) || + r->id.idiag_dport || cb->args[3] > 0) goto syn_recv; - if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + if (inet_diag_dump_sock(skb, sk, cb) < 0) { inet_listen_unlock(hashinfo); goto done; } syn_recv: - if (!(r->tcpdiag_states&TCPF_SYN_RECV)) + if (!(r->idiag_states & TCPF_SYN_RECV)) goto next_listen; - if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { + if (inet_diag_dump_reqs(skb, sk, cb) < 0) { inet_listen_unlock(hashinfo); goto done; } @@ -662,7 +659,7 @@ skip_listen_ht: s_i = num = s_num = 0; } - if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) + if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) return skb->len; for (i = s_i; i < hashinfo->ehash_size; i++) { @@ -681,14 +678,14 @@ skip_listen_ht: if (num < s_num) goto next_normal; - if (!(r->tcpdiag_states & (1 << sk->sk_state))) + if (!(r->idiag_states & (1 << sk->sk_state))) goto next_normal; - if (r->id.tcpdiag_sport != inet->sport && - r->id.tcpdiag_sport) + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) goto next_normal; - if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport) + if (r->id.idiag_dport != inet->dport && r->id.idiag_dport) goto next_normal; - if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + if (inet_diag_dump_sock(skb, sk, cb) < 0) { read_unlock_bh(&head->lock); goto done; } @@ -696,20 +693,20 @@ next_normal: ++num; } - if (r->tcpdiag_states&TCPF_TIME_WAIT) { + if (r->idiag_states & TCPF_TIME_WAIT) { sk_for_each(sk, node, &hashinfo->ehash[i + hashinfo->ehash_size].chain) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) goto next_dying; - if (r->id.tcpdiag_sport != inet->sport && - r->id.tcpdiag_sport) + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) goto next_dying; - if (r->id.tcpdiag_dport != inet->dport && - r->id.tcpdiag_dport) + if (r->id.idiag_dport != inet->dport && + r->id.idiag_dport) goto next_dying; - if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + if (inet_diag_dump_sock(skb, sk, cb) < 0) { read_unlock_bh(&head->lock); goto done; } @@ -726,14 +723,14 @@ done: return skb->len; } -static int tcpdiag_dump_done(struct netlink_callback *cb) +static int inet_diag_dump_done(struct netlink_callback *cb) { return 0; } static __inline__ int -tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) return 0; @@ -744,24 +741,28 @@ tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (inet_diag_table[nlh->nlmsg_type] == NULL) return -ENOENT; - if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) + if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) goto err_inval; if (nlh->nlmsg_flags&NLM_F_DUMP) { - if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) { - struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq)); - if (rta->rta_type != TCPDIAG_REQ_BYTECODE || + if (nlh->nlmsg_len > + (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { + struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + + sizeof(struct inet_diag_req)); + if (rta->rta_type != INET_DIAG_REQ_BYTECODE || rta->rta_len < 8 || - rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq))) + rta->rta_len > + (nlh->nlmsg_len - + NLMSG_SPACE(sizeof(struct inet_diag_req)))) goto err_inval; - if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) + if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) goto err_inval; } - return netlink_dump_start(tcpnl, skb, nlh, - tcpdiag_dump, - tcpdiag_dump_done); + return netlink_dump_start(idiagnl, skb, nlh, + inet_diag_dump, + inet_diag_dump_done); } else { - return tcpdiag_get_exact(skb, nlh); + return inet_diag_get_exact(skb, nlh); } err_inval: @@ -769,7 +770,7 @@ err_inval: } -static inline void tcpdiag_rcv_skb(struct sk_buff *skb) +static inline void inet_diag_rcv_skb(struct sk_buff *skb) { int err; struct nlmsghdr * nlh; @@ -778,31 +779,31 @@ static inline void tcpdiag_rcv_skb(struct sk_buff *skb) nlh = (struct nlmsghdr *)skb->data; if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) return; - err = tcpdiag_rcv_msg(skb, nlh); + err = inet_diag_rcv_msg(skb, nlh); if (err || nlh->nlmsg_flags & NLM_F_ACK) netlink_ack(skb, nlh, err); } } -static void tcpdiag_rcv(struct sock *sk, int len) +static void inet_diag_rcv(struct sock *sk, int len) { struct sk_buff *skb; unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { - tcpdiag_rcv_skb(skb); + inet_diag_rcv_skb(skb); kfree_skb(skb); } } -static void tcp_diag_get_info(struct sock *sk, struct tcpdiagmsg *r, +static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_info *info = _info; - r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->idiag_wqueue = tp->write_seq - tp->snd_una; if (info != NULL) tcp_get_info(sk, info); } @@ -851,7 +852,7 @@ void inet_diag_unregister(const struct inet_diag_handler *h) } EXPORT_SYMBOL_GPL(inet_diag_unregister); -static int __init tcpdiag_init(void) +static int __init inet_diag_init(void) { const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * sizeof(struct inet_diag_handler *)); @@ -863,9 +864,9 @@ static int __init tcpdiag_init(void) memset(inet_diag_table, 0, inet_diag_table_size); - tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv, - THIS_MODULE); - if (tcpnl == NULL) + idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, inet_diag_rcv, + THIS_MODULE); + if (idiagnl == NULL) goto out_free_table; err = inet_diag_register(&tcp_diag_handler); @@ -874,18 +875,18 @@ static int __init tcpdiag_init(void) out: return err; out_sock_release: - sock_release(tcpnl->sk_socket); + sock_release(idiagnl->sk_socket); out_free_table: kfree(inet_diag_table); goto out; } -static void __exit tcpdiag_exit(void) +static void __exit inet_diag_exit(void) { - sock_release(tcpnl->sk_socket); + sock_release(idiagnl->sk_socket); kfree(inet_diag_table); } -module_init(tcpdiag_init); -module_exit(tcpdiag_exit); +module_init(inet_diag_init); +module_exit(inet_diag_exit); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 054de24..8cef9dc 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -365,10 +365,10 @@ static void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct vegas *ca = inet_csk_ca(sk); - if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { struct tcpvegas_info *info; - info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, + info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info))); info->tcpv_enabled = ca->doing_vegas_now; diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index d8a5a2b..3951003 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -216,11 +216,11 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct westwood *ca = inet_csk_ca(sk); - if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { struct rtattr *rta; struct tcpvegas_info *info; - rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); + rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info)); info = RTA_DATA(rta); info->tcpv_enabled = 1; info->tcpv_rttcnt = 0; -- cgit v0.10.2 From a8c2190ee7da1a1dc68ff1a6b5f03feb61e523a5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 12:56:38 -0300 Subject: [INET_DIAG]: Rename tcp_diag.[ch] to inet_diag.[ch] Next changeset will introduce net/ipv4/tcp_diag.c, moving the code that was put transitioanlly in inet_diag.c. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h new file mode 100644 index 0000000..a4606e5 --- /dev/null +++ b/include/linux/inet_diag.h @@ -0,0 +1,138 @@ +#ifndef _INET_DIAG_H_ +#define _INET_DIAG_H_ 1 + +/* Just some random number */ +#define TCPDIAG_GETSOCK 18 +#define DCCPDIAG_GETSOCK 19 + +#define INET_DIAG_GETSOCK_MAX 24 + +/* Socket identity */ +struct inet_diag_sockid { + __u16 idiag_sport; + __u16 idiag_dport; + __u32 idiag_src[4]; + __u32 idiag_dst[4]; + __u32 idiag_if; + __u32 idiag_cookie[2]; +#define INET_DIAG_NOCOOKIE (~0U) +}; + +/* Request structure */ + +struct inet_diag_req { + __u8 idiag_family; /* Family of addresses. */ + __u8 idiag_src_len; + __u8 idiag_dst_len; + __u8 idiag_ext; /* Query extended information */ + + struct inet_diag_sockid id; + + __u32 idiag_states; /* States to dump */ + __u32 idiag_dbs; /* Tables to dump (NI) */ +}; + +enum { + INET_DIAG_REQ_NONE, + INET_DIAG_REQ_BYTECODE, +}; + +#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE + +/* Bytecode is sequence of 4 byte commands followed by variable arguments. + * All the commands identified by "code" are conditional jumps forward: + * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be + * length of the command and its arguments. + */ + +struct inet_diag_bc_op { + unsigned char code; + unsigned char yes; + unsigned short no; +}; + +enum { + INET_DIAG_BC_NOP, + INET_DIAG_BC_JMP, + INET_DIAG_BC_S_GE, + INET_DIAG_BC_S_LE, + INET_DIAG_BC_D_GE, + INET_DIAG_BC_D_LE, + INET_DIAG_BC_AUTO, + INET_DIAG_BC_S_COND, + INET_DIAG_BC_D_COND, +}; + +struct inet_diag_hostcond { + __u8 family; + __u8 prefix_len; + int port; + __u32 addr[0]; +}; + +/* Base info structure. It contains socket identity (addrs/ports/cookie) + * and, alas, the information shown by netstat. */ +struct inet_diag_msg { + __u8 idiag_family; + __u8 idiag_state; + __u8 idiag_timer; + __u8 idiag_retrans; + + struct inet_diag_sockid id; + + __u32 idiag_expires; + __u32 idiag_rqueue; + __u32 idiag_wqueue; + __u32 idiag_uid; + __u32 idiag_inode; +}; + +/* Extensions */ + +enum { + INET_DIAG_NONE, + INET_DIAG_MEMINFO, + INET_DIAG_INFO, + INET_DIAG_VEGASINFO, + INET_DIAG_CONG, +}; + +#define INET_DIAG_MAX INET_DIAG_CONG + + +/* INET_DIAG_MEM */ + +struct inet_diag_meminfo { + __u32 idiag_rmem; + __u32 idiag_wmem; + __u32 idiag_fmem; + __u32 idiag_tmem; +}; + +/* INET_DIAG_VEGASINFO */ + +struct tcpvegas_info { + __u32 tcpv_enabled; + __u32 tcpv_rttcnt; + __u32 tcpv_rtt; + __u32 tcpv_minrtt; +}; + +#ifdef __KERNEL__ +struct sock; +struct inet_hashinfo; + +struct inet_diag_handler { + struct inet_hashinfo *idiag_hashinfo; + void (*idiag_get_info)(struct sock *sk, + struct inet_diag_msg *r, + void *info); + __u16 idiag_info_size; + __u16 idiag_type; +}; + +extern int inet_diag_register(const struct inet_diag_handler *handler); +extern void inet_diag_unregister(const struct inet_diag_handler *handler); +#endif /* __KERNEL__ */ + +#endif /* _INET_DIAG_H_ */ diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h deleted file mode 100644 index a4606e5..0000000 --- a/include/linux/tcp_diag.h +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef _INET_DIAG_H_ -#define _INET_DIAG_H_ 1 - -/* Just some random number */ -#define TCPDIAG_GETSOCK 18 -#define DCCPDIAG_GETSOCK 19 - -#define INET_DIAG_GETSOCK_MAX 24 - -/* Socket identity */ -struct inet_diag_sockid { - __u16 idiag_sport; - __u16 idiag_dport; - __u32 idiag_src[4]; - __u32 idiag_dst[4]; - __u32 idiag_if; - __u32 idiag_cookie[2]; -#define INET_DIAG_NOCOOKIE (~0U) -}; - -/* Request structure */ - -struct inet_diag_req { - __u8 idiag_family; /* Family of addresses. */ - __u8 idiag_src_len; - __u8 idiag_dst_len; - __u8 idiag_ext; /* Query extended information */ - - struct inet_diag_sockid id; - - __u32 idiag_states; /* States to dump */ - __u32 idiag_dbs; /* Tables to dump (NI) */ -}; - -enum { - INET_DIAG_REQ_NONE, - INET_DIAG_REQ_BYTECODE, -}; - -#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE - -/* Bytecode is sequence of 4 byte commands followed by variable arguments. - * All the commands identified by "code" are conditional jumps forward: - * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be - * length of the command and its arguments. - */ - -struct inet_diag_bc_op { - unsigned char code; - unsigned char yes; - unsigned short no; -}; - -enum { - INET_DIAG_BC_NOP, - INET_DIAG_BC_JMP, - INET_DIAG_BC_S_GE, - INET_DIAG_BC_S_LE, - INET_DIAG_BC_D_GE, - INET_DIAG_BC_D_LE, - INET_DIAG_BC_AUTO, - INET_DIAG_BC_S_COND, - INET_DIAG_BC_D_COND, -}; - -struct inet_diag_hostcond { - __u8 family; - __u8 prefix_len; - int port; - __u32 addr[0]; -}; - -/* Base info structure. It contains socket identity (addrs/ports/cookie) - * and, alas, the information shown by netstat. */ -struct inet_diag_msg { - __u8 idiag_family; - __u8 idiag_state; - __u8 idiag_timer; - __u8 idiag_retrans; - - struct inet_diag_sockid id; - - __u32 idiag_expires; - __u32 idiag_rqueue; - __u32 idiag_wqueue; - __u32 idiag_uid; - __u32 idiag_inode; -}; - -/* Extensions */ - -enum { - INET_DIAG_NONE, - INET_DIAG_MEMINFO, - INET_DIAG_INFO, - INET_DIAG_VEGASINFO, - INET_DIAG_CONG, -}; - -#define INET_DIAG_MAX INET_DIAG_CONG - - -/* INET_DIAG_MEM */ - -struct inet_diag_meminfo { - __u32 idiag_rmem; - __u32 idiag_wmem; - __u32 idiag_fmem; - __u32 idiag_tmem; -}; - -/* INET_DIAG_VEGASINFO */ - -struct tcpvegas_info { - __u32 tcpv_enabled; - __u32 tcpv_rttcnt; - __u32 tcpv_rtt; - __u32 tcpv_minrtt; -}; - -#ifdef __KERNEL__ -struct sock; -struct inet_hashinfo; - -struct inet_diag_handler { - struct inet_hashinfo *idiag_hashinfo; - void (*idiag_get_info)(struct sock *sk, - struct inet_diag_msg *r, - void *info); - __u16 idiag_info_size; - __u16 idiag_type; -}; - -extern int inet_diag_register(const struct inet_diag_handler *handler); -extern void inet_diag_unregister(const struct inet_diag_handler *handler); -#endif /* __KERNEL__ */ - -#endif /* _INET_DIAG_H_ */ diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 9f07eff..0b10c17 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include "dccp.h" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9b1c894..fe5accb 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -30,7 +30,7 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ -obj-$(CONFIG_IP_INET_DIAG) += tcp_diag.o +obj-$(CONFIG_IP_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c new file mode 100644 index 0000000..3bd5109 --- /dev/null +++ b/net/ipv4/inet_diag.c @@ -0,0 +1,893 @@ +/* + * inet_diag.c Module for monitoring INET transport protocols sockets. + * + * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +static const struct inet_diag_handler **inet_diag_table; + +struct inet_diag_entry { + u32 *saddr; + u32 *daddr; + u16 sport; + u16 dport; + u16 family; + u16 userlocks; +}; + +static struct sock *idiagnl; + +#define INET_DIAG_PUT(skb, attrtype, attrlen) \ + RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) + +static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, + int ext, u32 pid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + void *info = NULL; + struct inet_diag_meminfo *minfo = NULL; + unsigned char *b = skb->tail; + const struct inet_diag_handler *handler; + + handler = inet_diag_table[unlh->nlmsg_type]; + BUG_ON(handler == NULL); + + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); + nlh->nlmsg_flags = nlmsg_flags; + + r = NLMSG_DATA(nlh); + if (sk->sk_state != TCP_TIME_WAIT) { + if (ext & (1 << (INET_DIAG_MEMINFO - 1))) + minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, + sizeof(*minfo)); + if (ext & (1 << (INET_DIAG_INFO - 1))) + info = INET_DIAG_PUT(skb, INET_DIAG_INFO, + handler->idiag_info_size); + + if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { + size_t len = strlen(icsk->icsk_ca_ops->name); + strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), + icsk->icsk_ca_ops->name); + } + } + r->idiag_family = sk->sk_family; + r->idiag_state = sk->sk_state; + r->idiag_timer = 0; + r->idiag_retrans = 0; + + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)sk; + r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); + + if (r->idiag_state == TCP_TIME_WAIT) { + const struct inet_timewait_sock *tw = inet_twsk(sk); + long tmo = tw->tw_ttd - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.idiag_sport = tw->tw_sport; + r->id.idiag_dport = tw->tw_dport; + r->id.idiag_src[0] = tw->tw_rcv_saddr; + r->id.idiag_dst[0] = tw->tw_daddr; + r->idiag_state = tw->tw_substate; + r->idiag_timer = 3; + r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &tcp6tw->tw_v6_rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &tcp6tw->tw_v6_daddr); + } +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + } + + r->id.idiag_sport = inet->sport; + r->id.idiag_dport = inet->dport; + r->id.idiag_src[0] = inet->rcv_saddr; + r->id.idiag_dst[0] = inet->daddr; + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &np->rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &np->daddr); + } +#endif + +#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + r->idiag_timer = 1; + r->idiag_retrans = icsk->icsk_retransmits; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + r->idiag_timer = 4; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (timer_pending(&sk->sk_timer)) { + r->idiag_timer = 2; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); + } else { + r->idiag_timer = 0; + r->idiag_expires = 0; + } +#undef EXPIRES_IN_MS + + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = sock_i_ino(sk); + + if (minfo) { + minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc); + minfo->idiag_wmem = sk->sk_wmem_queued; + minfo->idiag_fmem = sk->sk_forward_alloc; + minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc); + } + + handler->idiag_get_info(sk, r, info); + + if (sk->sk_state < TCP_TIME_WAIT && + icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) + icsk->icsk_ca_ops->get_info(sk, ext, skb); + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +rtattr_failure: +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) +{ + int err; + struct sock *sk; + struct inet_diag_req *req = NLMSG_DATA(nlh); + struct sk_buff *rep; + struct inet_hashinfo *hashinfo; + const struct inet_diag_handler *handler; + + handler = inet_diag_table[nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; + + if (req->idiag_family == AF_INET) { + sk = inet_lookup(hashinfo, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + else if (req->idiag_family == AF_INET6) { + sk = inet6_lookup(hashinfo, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); + } +#endif + else { + return -EINVAL; + } + + if (sk == NULL) + return -ENOENT; + + err = -ESTALE; + if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || + req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) && + ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || + (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) + goto out; + + err = -ENOMEM; + rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + + handler->idiag_info_size + 64)), + GFP_KERNEL); + if (!rep) + goto out; + + if (inet_diag_fill(rep, sk, req->idiag_ext, + NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, 0, nlh) <= 0) + BUG(); + + err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, + MSG_DONTWAIT); + if (err > 0) + err = 0; + +out: + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put((struct inet_timewait_sock *)sk); + else + sock_put(sk); + } + return err; +} + +static int bitstring_match(const u32 *a1, const u32 *a2, int bits) +{ + int words = bits >> 5; + + bits &= 0x1f; + + if (words) { + if (memcmp(a1, a2, words << 2)) + return 0; + } + if (bits) { + __u32 w1, w2; + __u32 mask; + + w1 = a1[words]; + w2 = a2[words]; + + mask = htonl((0xffffffff) << (32 - bits)); + + if ((w1 ^ w2) & mask) + return 0; + } + + return 1; +} + + +static int inet_diag_bc_run(const void *bc, int len, + const struct inet_diag_entry *entry) +{ + while (len > 0) { + int yes = 1; + const struct inet_diag_bc_op *op = bc; + + switch (op->code) { + case INET_DIAG_BC_NOP: + break; + case INET_DIAG_BC_JMP: + yes = 0; + break; + case INET_DIAG_BC_S_GE: + yes = entry->sport >= op[1].no; + break; + case INET_DIAG_BC_S_LE: + yes = entry->dport <= op[1].no; + break; + case INET_DIAG_BC_D_GE: + yes = entry->dport >= op[1].no; + break; + case INET_DIAG_BC_D_LE: + yes = entry->dport <= op[1].no; + break; + case INET_DIAG_BC_AUTO: + yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); + break; + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: { + struct inet_diag_hostcond *cond; + u32 *addr; + + cond = (struct inet_diag_hostcond *)(op + 1); + if (cond->port != -1 && + cond->port != (op->code == INET_DIAG_BC_S_COND ? + entry->sport : entry->dport)) { + yes = 0; + break; + } + + if (cond->prefix_len == 0) + break; + + if (op->code == INET_DIAG_BC_S_COND) + addr = entry->saddr; + else + addr = entry->daddr; + + if (bitstring_match(addr, cond->addr, cond->prefix_len)) + break; + if (entry->family == AF_INET6 && + cond->family == AF_INET) { + if (addr[0] == 0 && addr[1] == 0 && + addr[2] == htonl(0xffff) && + bitstring_match(addr + 3, cond->addr, + cond->prefix_len)) + break; + } + yes = 0; + break; + } + } + + if (yes) { + len -= op->yes; + bc += op->yes; + } else { + len -= op->no; + bc += op->no; + } + } + return (len == 0); +} + +static int valid_cc(const void *bc, int len, int cc) +{ + while (len >= 0) { + const struct inet_diag_bc_op *op = bc; + + if (cc > len) + return 0; + if (cc == len) + return 1; + if (op->yes < 4) + return 0; + len -= op->yes; + bc += op->yes; + } + return 0; +} + +static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) +{ + const unsigned char *bc = bytecode; + int len = bytecode_len; + + while (len > 0) { + struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; + +//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); + switch (op->code) { + case INET_DIAG_BC_AUTO: + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: + case INET_DIAG_BC_S_GE: + case INET_DIAG_BC_S_LE: + case INET_DIAG_BC_D_GE: + case INET_DIAG_BC_D_LE: + if (op->yes < 4 || op->yes > len + 4) + return -EINVAL; + case INET_DIAG_BC_JMP: + if (op->no < 4 || op->no > len + 4) + return -EINVAL; + if (op->no < len && + !valid_cc(bytecode, bytecode_len, len - op->no)) + return -EINVAL; + break; + case INET_DIAG_BC_NOP: + if (op->yes < 4 || op->yes > len + 4) + return -EINVAL; + break; + default: + return -EINVAL; + } + bc += op->yes; + len -= op->yes; + } + return len == 0 ? 0 : -EINVAL; +} + +static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + struct inet_diag_entry entry; + struct rtattr *bc = (struct rtattr *)(r + 1); + struct inet_sock *inet = inet_sk(sk); + + entry.family = sk->sk_family; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (entry.family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + entry.saddr = np->rcv_saddr.s6_addr32; + entry.daddr = np->daddr.s6_addr32; + } else +#endif + { + entry.saddr = &inet->rcv_saddr; + entry.daddr = &inet->daddr; + } + entry.sport = inet->num; + entry.dport = ntohs(inet->dport); + entry.userlocks = sk->sk_userlocks; + + if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + return 0; + } + + return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); +} + +static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, + struct request_sock *req, + u32 pid, u32 seq, + const struct nlmsghdr *unlh) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_sock *inet = inet_sk(sk); + unsigned char *b = skb->tail; + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); + nlh->nlmsg_flags = NLM_F_MULTI; + r = NLMSG_DATA(nlh); + + r->idiag_family = sk->sk_family; + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = req->retrans; + + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)req; + r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); + + tmo = req->expires - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.idiag_sport = inet->sport; + r->id.idiag_dport = ireq->rmt_port; + r->id.idiag_src[0] = ireq->loc_addr; + r->id.idiag_dst[0] = ireq->rmt_addr; + r->idiag_expires = jiffies_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = 0; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &tcp6_rsk(req)->loc_addr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &tcp6_rsk(req)->rmt_addr); + } +#endif + nlh->nlmsg_len = skb->tail - b; + + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct inet_diag_entry entry; + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt; + struct rtattr *bc = NULL; + struct inet_sock *inet = inet_sk(sk); + int j, s_j; + int reqnum, s_reqnum; + int err = 0; + + s_j = cb->args[3]; + s_reqnum = cb->args[4]; + + if (s_j > 0) + s_j--; + + entry.family = sk->sk_family; + + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + + lopt = icsk->icsk_accept_queue.listen_opt; + if (!lopt || !lopt->qlen) + goto out; + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + bc = (struct rtattr *)(r + 1); + entry.sport = inet->num; + entry.userlocks = sk->sk_userlocks; + } + + for (j = s_j; j < lopt->nr_table_entries; j++) { + struct request_sock *req, *head = lopt->syn_table[j]; + + reqnum = 0; + for (req = head; req; reqnum++, req = req->dl_next) { + struct inet_request_sock *ireq = inet_rsk(req); + + if (reqnum < s_reqnum) + continue; + if (r->id.idiag_dport != ireq->rmt_port && + r->id.idiag_dport) + continue; + + if (bc) { + entry.saddr = +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + (entry.family == AF_INET6) ? + tcp6_rsk(req)->loc_addr.s6_addr32 : +#endif + &ireq->loc_addr; + entry.daddr = +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + (entry.family == AF_INET6) ? + tcp6_rsk(req)->rmt_addr.s6_addr32 : +#endif + &ireq->rmt_addr; + entry.dport = ntohs(ireq->rmt_port); + + if (!inet_diag_bc_run(RTA_DATA(bc), + RTA_PAYLOAD(bc), &entry)) + continue; + } + + err = inet_diag_fill_req(skb, sk, req, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, cb->nlh); + if (err < 0) { + cb->args[3] = j + 1; + cb->args[4] = reqnum; + goto out; + } + } + + s_reqnum = 0; + } + +out: + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + + return err; +} + +static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int i, num; + int s_i, s_num; + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + const struct inet_diag_handler *handler; + struct inet_hashinfo *hashinfo; + + handler = inet_diag_table[cb->nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; + + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + goto skip_listen_ht; + + inet_listen_lock(hashinfo); + for (i = s_i; i < INET_LHTABLE_SIZE; i++) { + struct sock *sk; + struct hlist_node *node; + + num = 0; + sk_for_each(sk, node, &hashinfo->listening_hash[i]) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) { + num++; + continue; + } + + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) + goto next_listen; + + if (!(r->idiag_states & TCPF_LISTEN) || + r->id.idiag_dport || + cb->args[3] > 0) + goto syn_recv; + + if (inet_diag_dump_sock(skb, sk, cb) < 0) { + inet_listen_unlock(hashinfo); + goto done; + } + +syn_recv: + if (!(r->idiag_states & TCPF_SYN_RECV)) + goto next_listen; + + if (inet_diag_dump_reqs(skb, sk, cb) < 0) { + inet_listen_unlock(hashinfo); + goto done; + } + +next_listen: + cb->args[3] = 0; + cb->args[4] = 0; + ++num; + } + + s_num = 0; + cb->args[3] = 0; + cb->args[4] = 0; + } + inet_listen_unlock(hashinfo); +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + + if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + return skb->len; + + for (i = s_i; i < hashinfo->ehash_size; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + struct sock *sk; + struct hlist_node *node; + + if (i > s_i) + s_num = 0; + + read_lock_bh(&head->lock); + + num = 0; + sk_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_normal; + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next_normal; + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) + goto next_normal; + if (r->id.idiag_dport != inet->dport && r->id.idiag_dport) + goto next_normal; + if (inet_diag_dump_sock(skb, sk, cb) < 0) { + read_unlock_bh(&head->lock); + goto done; + } +next_normal: + ++num; + } + + if (r->idiag_states & TCPF_TIME_WAIT) { + sk_for_each(sk, node, + &hashinfo->ehash[i + hashinfo->ehash_size].chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_dying; + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) + goto next_dying; + if (r->id.idiag_dport != inet->dport && + r->id.idiag_dport) + goto next_dying; + if (inet_diag_dump_sock(skb, sk, cb) < 0) { + read_unlock_bh(&head->lock); + goto done; + } +next_dying: + ++num; + } + } + read_unlock_bh(&head->lock); + } + +done: + cb->args[1] = i; + cb->args[2] = num; + return skb->len; +} + +static int inet_diag_dump_done(struct netlink_callback *cb) +{ + return 0; +} + + +static __inline__ int +inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) + goto err_inval; + + if (inet_diag_table[nlh->nlmsg_type] == NULL) + return -ENOENT; + + if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) + goto err_inval; + + if (nlh->nlmsg_flags&NLM_F_DUMP) { + if (nlh->nlmsg_len > + (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { + struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + + sizeof(struct inet_diag_req)); + if (rta->rta_type != INET_DIAG_REQ_BYTECODE || + rta->rta_len < 8 || + rta->rta_len > + (nlh->nlmsg_len - + NLMSG_SPACE(sizeof(struct inet_diag_req)))) + goto err_inval; + if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) + goto err_inval; + } + return netlink_dump_start(idiagnl, skb, nlh, + inet_diag_dump, + inet_diag_dump_done); + } else { + return inet_diag_get_exact(skb, nlh); + } + +err_inval: + return -EINVAL; +} + + +static inline void inet_diag_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + if (skb->len >= NLMSG_SPACE(0)) { + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + err = inet_diag_rcv_msg(skb, nlh); + if (err || nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, err); + } +} + +static void inet_diag_rcv(struct sock *sk, int len) +{ + struct sk_buff *skb; + unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + + while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { + inet_diag_rcv_skb(skb); + kfree_skb(skb); + } +} + +static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_info *info = _info; + + r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->idiag_wqueue = tp->write_seq - tp->snd_una; + if (info != NULL) + tcp_get_info(sk, info); +} + +static struct inet_diag_handler tcp_diag_handler = { + .idiag_hashinfo = &tcp_hashinfo, + .idiag_get_info = tcp_diag_get_info, + .idiag_type = TCPDIAG_GETSOCK, + .idiag_info_size = sizeof(struct tcp_info), +}; + +static DEFINE_SPINLOCK(inet_diag_register_lock); + +int inet_diag_register(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + int err = -EINVAL; + + if (type >= INET_DIAG_GETSOCK_MAX) + goto out; + + spin_lock(&inet_diag_register_lock); + err = -EEXIST; + if (inet_diag_table[type] == NULL) { + inet_diag_table[type] = h; + err = 0; + } + spin_unlock(&inet_diag_register_lock); +out: + return err; +} +EXPORT_SYMBOL_GPL(inet_diag_register); + +void inet_diag_unregister(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + + if (type >= INET_DIAG_GETSOCK_MAX) + return; + + spin_lock(&inet_diag_register_lock); + inet_diag_table[type] = NULL; + spin_unlock(&inet_diag_register_lock); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(inet_diag_unregister); + +static int __init inet_diag_init(void) +{ + const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * + sizeof(struct inet_diag_handler *)); + int err = -ENOMEM; + + inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL); + if (!inet_diag_table) + goto out; + + memset(inet_diag_table, 0, inet_diag_table_size); + + idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, inet_diag_rcv, + THIS_MODULE); + if (idiagnl == NULL) + goto out_free_table; + + err = inet_diag_register(&tcp_diag_handler); + if (err) + goto out_sock_release; +out: + return err; +out_sock_release: + sock_release(idiagnl->sk_socket); +out_free_table: + kfree(inet_diag_table); + goto out; +} + +static void __exit inet_diag_exit(void) +{ + sock_release(idiagnl->sk_socket); + kfree(inet_diag_table); +} + +module_init(inet_diag_init); +module_exit(inet_diag_exit); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c deleted file mode 100644 index 24abe82..0000000 --- a/net/ipv4/tcp_diag.c +++ /dev/null @@ -1,892 +0,0 @@ -/* - * inet_diag.c Module for monitoring INET transport protocols sockets. - * - * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ - * - * Authors: Alexey Kuznetsov, - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -static const struct inet_diag_handler **inet_diag_table; - -struct inet_diag_entry { - u32 *saddr; - u32 *daddr; - u16 sport; - u16 dport; - u16 family; - u16 userlocks; -}; - -static struct sock *idiagnl; - -#define INET_DIAG_PUT(skb, attrtype, attrlen) \ - RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) - -static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, - int ext, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - void *info = NULL; - struct inet_diag_meminfo *minfo = NULL; - unsigned char *b = skb->tail; - const struct inet_diag_handler *handler; - - handler = inet_diag_table[unlh->nlmsg_type]; - BUG_ON(handler == NULL); - - nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); - nlh->nlmsg_flags = nlmsg_flags; - - r = NLMSG_DATA(nlh); - if (sk->sk_state != TCP_TIME_WAIT) { - if (ext & (1 << (INET_DIAG_MEMINFO - 1))) - minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, - sizeof(*minfo)); - if (ext & (1 << (INET_DIAG_INFO - 1))) - info = INET_DIAG_PUT(skb, INET_DIAG_INFO, - handler->idiag_info_size); - - if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { - size_t len = strlen(icsk->icsk_ca_ops->name); - strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), - icsk->icsk_ca_ops->name); - } - } - r->idiag_family = sk->sk_family; - r->idiag_state = sk->sk_state; - r->idiag_timer = 0; - r->idiag_retrans = 0; - - r->id.idiag_if = sk->sk_bound_dev_if; - r->id.idiag_cookie[0] = (u32)(unsigned long)sk; - r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); - - if (r->idiag_state == TCP_TIME_WAIT) { - const struct inet_timewait_sock *tw = inet_twsk(sk); - long tmo = tw->tw_ttd - jiffies; - if (tmo < 0) - tmo = 0; - - r->id.idiag_sport = tw->tw_sport; - r->id.idiag_dport = tw->tw_dport; - r->id.idiag_src[0] = tw->tw_rcv_saddr; - r->id.idiag_dst[0] = tw->tw_daddr; - r->idiag_state = tw->tw_substate; - r->idiag_timer = 3; - r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->idiag_family == AF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); - - ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6tw->tw_v6_rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6tw->tw_v6_daddr); - } -#endif - nlh->nlmsg_len = skb->tail - b; - return skb->len; - } - - r->id.idiag_sport = inet->sport; - r->id.idiag_dport = inet->dport; - r->id.idiag_src[0] = inet->rcv_saddr; - r->id.idiag_dst[0] = inet->daddr; - -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->idiag_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - - ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &np->rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &np->daddr); - } -#endif - -#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ - - if (icsk->icsk_pending == ICSK_TIME_RETRANS) { - r->idiag_timer = 1; - r->idiag_retrans = icsk->icsk_retransmits; - r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { - r->idiag_timer = 4; - r->idiag_retrans = icsk->icsk_probes_out; - r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); - } else if (timer_pending(&sk->sk_timer)) { - r->idiag_timer = 2; - r->idiag_retrans = icsk->icsk_probes_out; - r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); - } else { - r->idiag_timer = 0; - r->idiag_expires = 0; - } -#undef EXPIRES_IN_MS - - r->idiag_uid = sock_i_uid(sk); - r->idiag_inode = sock_i_ino(sk); - - if (minfo) { - minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc); - minfo->idiag_wmem = sk->sk_wmem_queued; - minfo->idiag_fmem = sk->sk_forward_alloc; - minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc); - } - - handler->idiag_get_info(sk, r, info); - - if (sk->sk_state < TCP_TIME_WAIT && - icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) - icsk->icsk_ca_ops->get_info(sk, ext, skb); - - nlh->nlmsg_len = skb->tail - b; - return skb->len; - -rtattr_failure: -nlmsg_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) -{ - int err; - struct sock *sk; - struct inet_diag_req *req = NLMSG_DATA(nlh); - struct sk_buff *rep; - struct inet_hashinfo *hashinfo; - const struct inet_diag_handler *handler; - - handler = inet_diag_table[nlh->nlmsg_type]; - BUG_ON(handler == NULL); - hashinfo = handler->idiag_hashinfo; - - if (req->idiag_family == AF_INET) { - sk = inet_lookup(hashinfo, req->id.idiag_dst[0], - req->id.idiag_dport, req->id.idiag_src[0], - req->id.idiag_sport, req->id.idiag_if); - } -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - else if (req->idiag_family == AF_INET6) { - sk = inet6_lookup(hashinfo, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if); - } -#endif - else { - return -EINVAL; - } - - if (sk == NULL) - return -ENOENT; - - err = -ESTALE; - if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || - req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) && - ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || - (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) - goto out; - - err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - handler->idiag_info_size + 64)), - GFP_KERNEL); - if (!rep) - goto out; - - if (inet_diag_fill(rep, sk, req->idiag_ext, - NETLINK_CB(in_skb).pid, - nlh->nlmsg_seq, 0, nlh) <= 0) - BUG(); - - err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, - MSG_DONTWAIT); - if (err > 0) - err = 0; - -out: - if (sk) { - if (sk->sk_state == TCP_TIME_WAIT) - inet_twsk_put((struct inet_timewait_sock *)sk); - else - sock_put(sk); - } - return err; -} - -static int bitstring_match(const u32 *a1, const u32 *a2, int bits) -{ - int words = bits >> 5; - - bits &= 0x1f; - - if (words) { - if (memcmp(a1, a2, words << 2)) - return 0; - } - if (bits) { - __u32 w1, w2; - __u32 mask; - - w1 = a1[words]; - w2 = a2[words]; - - mask = htonl((0xffffffff) << (32 - bits)); - - if ((w1 ^ w2) & mask) - return 0; - } - - return 1; -} - - -static int inet_diag_bc_run(const void *bc, int len, - const struct inet_diag_entry *entry) -{ - while (len > 0) { - int yes = 1; - const struct inet_diag_bc_op *op = bc; - - switch (op->code) { - case INET_DIAG_BC_NOP: - break; - case INET_DIAG_BC_JMP: - yes = 0; - break; - case INET_DIAG_BC_S_GE: - yes = entry->sport >= op[1].no; - break; - case INET_DIAG_BC_S_LE: - yes = entry->dport <= op[1].no; - break; - case INET_DIAG_BC_D_GE: - yes = entry->dport >= op[1].no; - break; - case INET_DIAG_BC_D_LE: - yes = entry->dport <= op[1].no; - break; - case INET_DIAG_BC_AUTO: - yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); - break; - case INET_DIAG_BC_S_COND: - case INET_DIAG_BC_D_COND: - { - struct inet_diag_hostcond *cond = (struct inet_diag_hostcond*)(op+1); - u32 *addr; - - if (cond->port != -1 && - cond->port != (op->code == INET_DIAG_BC_S_COND ? - entry->sport : entry->dport)) { - yes = 0; - break; - } - - if (cond->prefix_len == 0) - break; - - if (op->code == INET_DIAG_BC_S_COND) - addr = entry->saddr; - else - addr = entry->daddr; - - if (bitstring_match(addr, cond->addr, cond->prefix_len)) - break; - if (entry->family == AF_INET6 && - cond->family == AF_INET) { - if (addr[0] == 0 && addr[1] == 0 && - addr[2] == htonl(0xffff) && - bitstring_match(addr+3, cond->addr, cond->prefix_len)) - break; - } - yes = 0; - break; - } - } - - if (yes) { - len -= op->yes; - bc += op->yes; - } else { - len -= op->no; - bc += op->no; - } - } - return (len == 0); -} - -static int valid_cc(const void *bc, int len, int cc) -{ - while (len >= 0) { - const struct inet_diag_bc_op *op = bc; - - if (cc > len) - return 0; - if (cc == len) - return 1; - if (op->yes < 4) - return 0; - len -= op->yes; - bc += op->yes; - } - return 0; -} - -static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) -{ - const unsigned char *bc = bytecode; - int len = bytecode_len; - - while (len > 0) { - struct inet_diag_bc_op *op = (struct inet_diag_bc_op*)bc; - -//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); - switch (op->code) { - case INET_DIAG_BC_AUTO: - case INET_DIAG_BC_S_COND: - case INET_DIAG_BC_D_COND: - case INET_DIAG_BC_S_GE: - case INET_DIAG_BC_S_LE: - case INET_DIAG_BC_D_GE: - case INET_DIAG_BC_D_LE: - if (op->yes < 4 || op->yes > len+4) - return -EINVAL; - case INET_DIAG_BC_JMP: - if (op->no < 4 || op->no > len+4) - return -EINVAL; - if (op->no < len && - !valid_cc(bytecode, bytecode_len, len-op->no)) - return -EINVAL; - break; - case INET_DIAG_BC_NOP: - if (op->yes < 4 || op->yes > len+4) - return -EINVAL; - break; - default: - return -EINVAL; - } - bc += op->yes; - len -= op->yes; - } - return len == 0 ? 0 : -EINVAL; -} - -static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb) -{ - struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - struct inet_diag_entry entry; - struct rtattr *bc = (struct rtattr *)(r + 1); - struct inet_sock *inet = inet_sk(sk); - - entry.family = sk->sk_family; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (entry.family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - - entry.saddr = np->rcv_saddr.s6_addr32; - entry.daddr = np->daddr.s6_addr32; - } else -#endif - { - entry.saddr = &inet->rcv_saddr; - entry.daddr = &inet->daddr; - } - entry.sport = inet->num; - entry.dport = ntohs(inet->dport); - entry.userlocks = sk->sk_userlocks; - - if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) - return 0; - } - - return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); -} - -static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, - struct request_sock *req, - u32 pid, u32 seq, - const struct nlmsghdr *unlh) -{ - const struct inet_request_sock *ireq = inet_rsk(req); - struct inet_sock *inet = inet_sk(sk); - unsigned char *b = skb->tail; - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); - nlh->nlmsg_flags = NLM_F_MULTI; - r = NLMSG_DATA(nlh); - - r->idiag_family = sk->sk_family; - r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; - r->idiag_retrans = req->retrans; - - r->id.idiag_if = sk->sk_bound_dev_if; - r->id.idiag_cookie[0] = (u32)(unsigned long)req; - r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); - - tmo = req->expires - jiffies; - if (tmo < 0) - tmo = 0; - - r->id.idiag_sport = inet->sport; - r->id.idiag_dport = ireq->rmt_port; - r->id.idiag_src[0] = ireq->loc_addr; - r->id.idiag_dst[0] = ireq->rmt_addr; - r->idiag_expires = jiffies_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = sock_i_uid(sk); - r->idiag_inode = 0; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->idiag_family == AF_INET6) { - ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6_rsk(req)->loc_addr); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6_rsk(req)->rmt_addr); - } -#endif - nlh->nlmsg_len = skb->tail - b; - - return skb->len; - -nlmsg_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb) -{ - struct inet_diag_entry entry; - struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt; - struct rtattr *bc = NULL; - struct inet_sock *inet = inet_sk(sk); - int j, s_j; - int reqnum, s_reqnum; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - - lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !lopt->qlen) - goto out; - - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - bc = (struct rtattr *)(r + 1); - entry.sport = inet->num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < lopt->nr_table_entries; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < s_reqnum) - continue; - if (r->id.idiag_dport != ireq->rmt_port && - r->id.idiag_dport) - continue; - - if (bc) { - entry.saddr = -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - (entry.family == AF_INET6) ? - tcp6_rsk(req)->loc_addr.s6_addr32 : -#endif - &ireq->loc_addr; - entry.daddr = -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - (entry.family == AF_INET6) ? - tcp6_rsk(req)->rmt_addr.s6_addr32 : -#endif - &ireq->rmt_addr; - entry.dport = ntohs(ireq->rmt_port); - - if (!inet_diag_bc_run(RTA_DATA(bc), - RTA_PAYLOAD(bc), &entry)) - continue; - } - - err = inet_diag_fill_req(skb, sk, req, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, cb->nlh); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - - return err; -} - -static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - int i, num; - int s_i, s_num; - struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - const struct inet_diag_handler *handler; - struct inet_hashinfo *hashinfo; - - handler = inet_diag_table[cb->nlh->nlmsg_type]; - BUG_ON(handler == NULL); - hashinfo = handler->idiag_hashinfo; - - s_i = cb->args[1]; - s_num = num = cb->args[2]; - - if (cb->args[0] == 0) { - if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) - goto skip_listen_ht; - - inet_listen_lock(hashinfo); - for (i = s_i; i < INET_LHTABLE_SIZE; i++) { - struct sock *sk; - struct hlist_node *node; - - num = 0; - sk_for_each(sk, node, &hashinfo->listening_hash[i]) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) { - num++; - continue; - } - - if (r->id.idiag_sport != inet->sport && - r->id.idiag_sport) - goto next_listen; - - if (!(r->idiag_states & TCPF_LISTEN) || - r->id.idiag_dport || - cb->args[3] > 0) - goto syn_recv; - - if (inet_diag_dump_sock(skb, sk, cb) < 0) { - inet_listen_unlock(hashinfo); - goto done; - } - -syn_recv: - if (!(r->idiag_states & TCPF_SYN_RECV)) - goto next_listen; - - if (inet_diag_dump_reqs(skb, sk, cb) < 0) { - inet_listen_unlock(hashinfo); - goto done; - } - -next_listen: - cb->args[3] = 0; - cb->args[4] = 0; - ++num; - } - - s_num = 0; - cb->args[3] = 0; - cb->args[4] = 0; - } - inet_listen_unlock(hashinfo); -skip_listen_ht: - cb->args[0] = 1; - s_i = num = s_num = 0; - } - - if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) - return skb->len; - - for (i = s_i; i < hashinfo->ehash_size; i++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[i]; - struct sock *sk; - struct hlist_node *node; - - if (i > s_i) - s_num = 0; - - read_lock_bh(&head->lock); - - num = 0; - sk_for_each(sk, node, &head->chain) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_normal; - if (!(r->idiag_states & (1 << sk->sk_state))) - goto next_normal; - if (r->id.idiag_sport != inet->sport && - r->id.idiag_sport) - goto next_normal; - if (r->id.idiag_dport != inet->dport && r->id.idiag_dport) - goto next_normal; - if (inet_diag_dump_sock(skb, sk, cb) < 0) { - read_unlock_bh(&head->lock); - goto done; - } -next_normal: - ++num; - } - - if (r->idiag_states & TCPF_TIME_WAIT) { - sk_for_each(sk, node, - &hashinfo->ehash[i + hashinfo->ehash_size].chain) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_dying; - if (r->id.idiag_sport != inet->sport && - r->id.idiag_sport) - goto next_dying; - if (r->id.idiag_dport != inet->dport && - r->id.idiag_dport) - goto next_dying; - if (inet_diag_dump_sock(skb, sk, cb) < 0) { - read_unlock_bh(&head->lock); - goto done; - } -next_dying: - ++num; - } - } - read_unlock_bh(&head->lock); - } - -done: - cb->args[1] = i; - cb->args[2] = num; - return skb->len; -} - -static int inet_diag_dump_done(struct netlink_callback *cb) -{ - return 0; -} - - -static __inline__ int -inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) - return 0; - - if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) - goto err_inval; - - if (inet_diag_table[nlh->nlmsg_type] == NULL) - return -ENOENT; - - if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) - goto err_inval; - - if (nlh->nlmsg_flags&NLM_F_DUMP) { - if (nlh->nlmsg_len > - (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { - struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + - sizeof(struct inet_diag_req)); - if (rta->rta_type != INET_DIAG_REQ_BYTECODE || - rta->rta_len < 8 || - rta->rta_len > - (nlh->nlmsg_len - - NLMSG_SPACE(sizeof(struct inet_diag_req)))) - goto err_inval; - if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) - goto err_inval; - } - return netlink_dump_start(idiagnl, skb, nlh, - inet_diag_dump, - inet_diag_dump_done); - } else { - return inet_diag_get_exact(skb, nlh); - } - -err_inval: - return -EINVAL; -} - - -static inline void inet_diag_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr * nlh; - - if (skb->len >= NLMSG_SPACE(0)) { - nlh = (struct nlmsghdr *)skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - err = inet_diag_rcv_msg(skb, nlh); - if (err || nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, err); - } -} - -static void inet_diag_rcv(struct sock *sk, int len) -{ - struct sk_buff *skb; - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); - - while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { - inet_diag_rcv_skb(skb); - kfree_skb(skb); - } -} - -static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *_info) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct tcp_info *info = _info; - - r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->idiag_wqueue = tp->write_seq - tp->snd_una; - if (info != NULL) - tcp_get_info(sk, info); -} - -static struct inet_diag_handler tcp_diag_handler = { - .idiag_hashinfo = &tcp_hashinfo, - .idiag_get_info = tcp_diag_get_info, - .idiag_type = TCPDIAG_GETSOCK, - .idiag_info_size = sizeof(struct tcp_info), -}; - -static DEFINE_SPINLOCK(inet_diag_register_lock); - -int inet_diag_register(const struct inet_diag_handler *h) -{ - const __u16 type = h->idiag_type; - int err = -EINVAL; - - if (type >= INET_DIAG_GETSOCK_MAX) - goto out; - - spin_lock(&inet_diag_register_lock); - err = -EEXIST; - if (inet_diag_table[type] == NULL) { - inet_diag_table[type] = h; - err = 0; - } - spin_unlock(&inet_diag_register_lock); -out: - return err; -} -EXPORT_SYMBOL_GPL(inet_diag_register); - -void inet_diag_unregister(const struct inet_diag_handler *h) -{ - const __u16 type = h->idiag_type; - - if (type >= INET_DIAG_GETSOCK_MAX) - return; - - spin_lock(&inet_diag_register_lock); - inet_diag_table[type] = NULL; - spin_unlock(&inet_diag_register_lock); - - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(inet_diag_unregister); - -static int __init inet_diag_init(void) -{ - const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * - sizeof(struct inet_diag_handler *)); - int err = -ENOMEM; - - inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL); - if (!inet_diag_table) - goto out; - - memset(inet_diag_table, 0, inet_diag_table_size); - - idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, inet_diag_rcv, - THIS_MODULE); - if (idiagnl == NULL) - goto out_free_table; - - err = inet_diag_register(&tcp_diag_handler); - if (err) - goto out_sock_release; -out: - return err; -out_sock_release: - sock_release(idiagnl->sk_socket); -out_free_table: - kfree(inet_diag_table); - goto out; -} - -static void __exit inet_diag_exit(void) -{ - sock_release(idiagnl->sk_socket); - kfree(inet_diag_table); -} - -module_init(inet_diag_init); -module_exit(inet_diag_exit); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 8cef9dc..93c5f92 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 3951003..0c340c3 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include /* TCP Westwood structure */ -- cgit v0.10.2 From 17b085eacef81a6286bd478f2ec75e04abb091cb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 12:59:17 -0300 Subject: [INET_DIAG]: Move the tcp_diag interface to the proper place With this the previous setup is back, i.e. tcp_diag can be built as a module, as dccp_diag and both share the infrastructure available in inet_diag. If one selects CONFIG_INET_DIAG as module CONFIG_INET_TCP_DIAG will also be built as a module, as will CONFIG_INET_DCCP_DIAG, if CONFIG_IP_DCCP was selected static or as a module, if CONFIG_INET_DIAG is y, being statically linked CONFIG_INET_TCP_DIAG will follow suit and CONFIG_INET_DCCP_DIAG will be built in the same manner as CONFIG_IP_DCCP. Now to aim at UDP, converting it to use inet_hashinfo, so that we can use iproute2 for UDP sockets as well. Ah, just to show an example of this new infrastructure working for DCCP :-) [root@qemu ~]# ./ss -dane State Recv-Q Send-Q Local Address:Port Peer Address:Port LISTEN 0 0 *:5001 *:* ino:942 sk:cfd503a0 ESTAB 0 0 127.0.0.1:5001 127.0.0.1:32770 ino:943 sk:cfd50a60 ESTAB 0 0 127.0.0.1:32770 127.0.0.1:5001 ino:947 sk:cfd50700 TIME-WAIT 0 0 127.0.0.1:32769 127.0.0.1:5001 timer:(timewait,3.430ms,0) ino:0 sk:cf209620 Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/tcp.h b/include/net/tcp.h index fef1227..d958260 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -479,7 +479,7 @@ static inline void tcp_clear_xmit_timers(struct sock *sk) extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); extern unsigned int tcp_current_mss(struct sock *sk, int large); -/* tcp_diag.c */ +/* tcp.c */ extern void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index efce4f3..6760830 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -19,9 +19,9 @@ config IP_DCCP If in doubt, say N. -config IP_DCCP_DIAG - depends on IP_DCCP && IP_INET_DIAG - def_tristate y if (IP_DCCP = y && IP_INET_DIAG = y) +config INET_DCCP_DIAG + depends on IP_DCCP && INET_DIAG + def_tristate y if (IP_DCCP = y && INET_DIAG = y) def_tristate m source "net/dccp/ccids/Kconfig" diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 5741fff..44a867f 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -3,8 +3,8 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ timer.o packet_history.o -obj-$(CONFIG_IP_DCCP_DIAG) += dccp_diag.o - -obj-y += ccids/ +obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o dccp_diag-y := diag.o + +obj-y += ccids/ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 019e88d..e55136a 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -413,8 +413,8 @@ config INET_TUNNEL If unsure, say Y. -config IP_INET_DIAG - tristate "IP: INET socket monitoring interface" +config INET_DIAG + tristate "INET: socket monitoring interface" default y ---help--- Support for INET (TCP, DCCP, etc) socket monitoring interface used by @@ -423,6 +423,10 @@ config IP_INET_DIAG If unsure, say Y. +config INET_TCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + config TCP_CONG_ADVANCED bool "TCP: advanced congestion control" ---help--- diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index fe5accb..f0435d0 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -30,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ -obj-$(CONFIG_IP_INET_DIAG) += inet_diag.o +obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o +obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3bd5109..1880ad8 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -797,25 +797,6 @@ static void inet_diag_rcv(struct sock *sk, int len) } } -static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *_info) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct tcp_info *info = _info; - - r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->idiag_wqueue = tp->write_seq - tp->snd_una; - if (info != NULL) - tcp_get_info(sk, info); -} - -static struct inet_diag_handler tcp_diag_handler = { - .idiag_hashinfo = &tcp_hashinfo, - .idiag_get_info = tcp_diag_get_info, - .idiag_type = TCPDIAG_GETSOCK, - .idiag_info_size = sizeof(struct tcp_info), -}; - static DEFINE_SPINLOCK(inet_diag_register_lock); int inet_diag_register(const struct inet_diag_handler *h) @@ -864,19 +845,13 @@ static int __init inet_diag_init(void) goto out; memset(inet_diag_table, 0, inet_diag_table_size); - idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, inet_diag_rcv, THIS_MODULE); if (idiagnl == NULL) goto out_free_table; - - err = inet_diag_register(&tcp_diag_handler); - if (err) - goto out_sock_release; + err = 0; out: return err; -out_sock_release: - sock_release(idiagnl->sk_socket); out_free_table: kfree(inet_diag_table); goto out; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c new file mode 100644 index 0000000..c148c10 --- /dev/null +++ b/net/ipv4/tcp_diag.c @@ -0,0 +1,54 @@ +/* + * tcp_diag.c Module for monitoring TCP transport protocols sockets. + * + * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +#include +#include + +#include + +#include + +static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_info *info = _info; + + r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->idiag_wqueue = tp->write_seq - tp->snd_una; + if (info != NULL) + tcp_get_info(sk, info); +} + +static struct inet_diag_handler tcp_diag_handler = { + .idiag_hashinfo = &tcp_hashinfo, + .idiag_get_info = tcp_diag_get_info, + .idiag_type = TCPDIAG_GETSOCK, + .idiag_info_size = sizeof(struct tcp_info), +}; + +static int __init tcp_diag_init(void) +{ + return inet_diag_register(&tcp_diag_handler); +} + +static void __exit tcp_diag_exit(void) +{ + inet_diag_unregister(&tcp_diag_handler); +} + +module_init(tcp_diag_init); +module_exit(tcp_diag_exit); +MODULE_LICENSE("GPL"); -- cgit v0.10.2 From 0ba2c6e8c0fb5cde5a23a213c2e7cb851b85c310 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sat, 13 Aug 2005 13:55:44 -0700 Subject: [NETFILTER]: introduce and use aligned_u64 data type As proposed by Andi Kleen, this is required esp. for x86_64 architecture, where 64bit code needs 8byte aligned 64bit data types, but 32bit userspace apps will only align to 4bytes. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h index a61836a..b04b038 100644 --- a/include/linux/netfilter/nfnetlink_log.h +++ b/include/linux/netfilter/nfnetlink_log.h @@ -5,6 +5,7 @@ * and not any kind of function definitions. It is shared between kernel and * userspace. Don't put kernel specific stuff in here */ +#include #include enum nfulnl_msg_types { @@ -27,8 +28,8 @@ struct nfulnl_msg_packet_hw { } __attribute__ ((packed)); struct nfulnl_msg_packet_timestamp { - u_int64_t sec; - u_int64_t usec; + aligned_u64 sec; + aligned_u64 usec; } __attribute__ ((packed)); #define NFULNL_PREFIXLEN 30 /* just like old log target */ diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h index 2d8d2b2..9e77437 100644 --- a/include/linux/netfilter/nfnetlink_queue.h +++ b/include/linux/netfilter/nfnetlink_queue.h @@ -1,6 +1,7 @@ #ifndef _NFNETLINK_QUEUE_H #define _NFNETLINK_QUEUE_H +#include #include enum nfqnl_msg_types { @@ -24,8 +25,8 @@ struct nfqnl_msg_packet_hw { } __attribute__ ((packed)); struct nfqnl_msg_packet_timestamp { - u_int64_t sec; - u_int64_t usec; + aligned_u64 sec; + aligned_u64 usec; } __attribute__ ((packed)); enum nfqnl_attr_type { diff --git a/include/linux/types.h b/include/linux/types.h index dcb13f8..2b678c2 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -123,6 +123,9 @@ typedef __u64 u_int64_t; typedef __s64 int64_t; #endif +/* this is a special 64bit data type that is 8-byte aligned */ +#define aligned_u64 unsigned long long __attribute__((aligned(8))) + /* * The type used for indexing onto a disc or disc partition. * If required, asm/types.h can override it and define -- cgit v0.10.2 From 9d810fd2d28a9d672eca3136476af1a54a380bb2 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sat, 13 Aug 2005 13:56:26 -0700 Subject: [NETFILTER]: Add new iptables "connbytes" match This patch ads a new "connbytes" match that utilizes the CONFIG_NF_CT_ACCT per-connection byte and packet counters. Using it you can do things like packet classification on average packet size within a connection. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv4/ipt_connbytes.h b/include/linux/netfilter_ipv4/ipt_connbytes.h new file mode 100644 index 0000000..abaa65a --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_connbytes.h @@ -0,0 +1,25 @@ +#ifndef _IPT_CONNBYTES_H +#define _IPT_CONNBYTES_H + +enum ipt_connbytes_what { + IPT_CONNBYTES_WHAT_PKTS, + IPT_CONNBYTES_WHAT_BYTES, + IPT_CONNBYTES_WHAT_AVGPKT, +}; + +enum ipt_connbytes_direction { + IPT_CONNBYTES_DIR_ORIGINAL, + IPT_CONNBYTES_DIR_REPLY, + IPT_CONNBYTES_DIR_BOTH, +}; + +struct ipt_connbytes_info +{ + struct { + aligned_u64 from; /* count to be matched */ + aligned_u64 to; /* count to be matched */ + } count; + u_int8_t what; /* ipt_connbytes_what */ + u_int8_t direction; /* ipt_connbytes_direction */ +}; +#endif diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 9f5e1d7..3f7e6e4 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -386,6 +386,16 @@ config IP_NF_MATCH_CONNMARK . The module will be called ipt_connmark.o. If unsure, say `N'. +config IP_NF_MATCH_CONNBYTES + tristate 'Connection byte/packet counter match support' + depends on IP_NF_CT_ACCT && IP_NF_IPTABLES + help + This option adds a `connbytes' match, which allows you to match the + number of bytes and/or packets for each direction within a connection. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + config IP_NF_MATCH_HASHLIMIT tristate 'hashlimit match support' depends on IP_NF_IPTABLES @@ -723,6 +733,5 @@ config IP_NF_CONNTRACK_NETLINK help This option enables support for a netlink-based userspace interface - endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 58aa7c61..7c8ae85 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o +obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c new file mode 100644 index 0000000..0dfb52c --- /dev/null +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -0,0 +1,166 @@ +/* Kernel module to match connection tracking byte counter. + * GPL (C) 2002 Martin Devera (devik@cdi.cz). + * + * 2004-07-20 Harald Welte + * - reimplemented to use per-connection accounting counters + * - add functionality to match number of packets + * - add functionality to match average packet size + * - add support to match directions seperately + * + */ +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); + +/* 64bit divisor, dividend and result. dynamic precision */ +static u_int64_t div64_64(u_int64_t divisor, u_int64_t dividend) +{ + u_int64_t result = divisor; + + if (dividend > 0xffffffff) { + int first_bit = find_first_bit((unsigned long *) ÷nd, sizeof(dividend)); + /* calculate number of bits to shift. shift exactly enough + * bits to make dividend fit in 32bits. */ + int num_shift = (64 - 32 - first_bit); + /* first bit has to be < 32, since dividend was > 0xffffffff */ + result = result >> num_shift; + dividend = dividend >> num_shift; + } + + do_div(divisor, dividend); + + return divisor; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_connbytes_info *sinfo = matchinfo; + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct; + u_int64_t what = 0; /* initialize to make gcc happy */ + + if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo))) + return 0; /* no match */ + + switch (sinfo->what) { + case IPT_CONNBYTES_WHAT_PKTS: + switch (sinfo->direction) { + case IPT_CONNBYTES_DIR_ORIGINAL: + what = ct->counters[IP_CT_DIR_ORIGINAL].packets; + break; + case IPT_CONNBYTES_DIR_REPLY: + what = ct->counters[IP_CT_DIR_REPLY].packets; + break; + case IPT_CONNBYTES_DIR_BOTH: + what = ct->counters[IP_CT_DIR_ORIGINAL].packets; + what += ct->counters[IP_CT_DIR_REPLY].packets; + break; + } + break; + case IPT_CONNBYTES_WHAT_BYTES: + switch (sinfo->direction) { + case IPT_CONNBYTES_DIR_ORIGINAL: + what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; + break; + case IPT_CONNBYTES_DIR_REPLY: + what = ct->counters[IP_CT_DIR_REPLY].bytes; + break; + case IPT_CONNBYTES_DIR_BOTH: + what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; + what += ct->counters[IP_CT_DIR_REPLY].bytes; + break; + } + break; + case IPT_CONNBYTES_WHAT_AVGPKT: + switch (sinfo->direction) { + case IPT_CONNBYTES_DIR_ORIGINAL: + what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, + ct->counters[IP_CT_DIR_ORIGINAL].packets); + break; + case IPT_CONNBYTES_DIR_REPLY: + what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes, + ct->counters[IP_CT_DIR_REPLY].packets); + break; + case IPT_CONNBYTES_DIR_BOTH: + { + u_int64_t bytes; + u_int64_t pkts; + bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes + + ct->counters[IP_CT_DIR_REPLY].bytes; + pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+ + ct->counters[IP_CT_DIR_REPLY].packets; + + /* FIXME_THEORETICAL: what to do if sum + * overflows ? */ + + what = div64_64(bytes, pkts); + } + break; + } + break; + } + + if (sinfo->count.to) + return (what <= sinfo->count.to && what >= sinfo->count.from); + else + return (what >= sinfo->count.from); +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_connbytes_info *sinfo = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info))) + return 0; + + if (sinfo->what != IPT_CONNBYTES_WHAT_PKTS && + sinfo->what != IPT_CONNBYTES_WHAT_BYTES && + sinfo->what != IPT_CONNBYTES_WHAT_AVGPKT) + return 0; + + if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL && + sinfo->direction != IPT_CONNBYTES_DIR_REPLY && + sinfo->direction != IPT_CONNBYTES_DIR_BOTH) + return 0; + + return 1; +} + +static struct ipt_match state_match = { + .name = "connbytes", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&state_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&state_match); +} + +module_init(init); +module_exit(fini); -- cgit v0.10.2 From 8ffde671730df0b392ca478643b88ef7153244c0 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 13 Aug 2005 13:57:58 -0700 Subject: [NETFILTER]: Fix div64_64 in ipt_connbytes Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c index 0dfb52c..47128c0 100644 --- a/net/ipv4/netfilter/ipt_connbytes.c +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -22,23 +22,19 @@ MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); /* 64bit divisor, dividend and result. dynamic precision */ -static u_int64_t div64_64(u_int64_t divisor, u_int64_t dividend) +static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) { - u_int64_t result = divisor; - - if (dividend > 0xffffffff) { - int first_bit = find_first_bit((unsigned long *) ÷nd, sizeof(dividend)); - /* calculate number of bits to shift. shift exactly enough - * bits to make dividend fit in 32bits. */ - int num_shift = (64 - 32 - first_bit); - /* first bit has to be < 32, since dividend was > 0xffffffff */ - result = result >> num_shift; - dividend = dividend >> num_shift; - } + u_int32_t d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); - do_div(divisor, dividend); + d = divisor >> shift; + dividend >>= shift; + } - return divisor; + do_div(dividend, d); + return dividend; } static int -- cgit v0.10.2 From 25ed891019b84498c83903ecf53df7ce35e9cff6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 13 Aug 2005 13:58:21 -0700 Subject: [NETFILTER]: Nicer names for ipt_connbytes constants Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv4/ipt_connbytes.h b/include/linux/netfilter_ipv4/ipt_connbytes.h index abaa65a..9e5532f 100644 --- a/include/linux/netfilter_ipv4/ipt_connbytes.h +++ b/include/linux/netfilter_ipv4/ipt_connbytes.h @@ -2,9 +2,9 @@ #define _IPT_CONNBYTES_H enum ipt_connbytes_what { - IPT_CONNBYTES_WHAT_PKTS, - IPT_CONNBYTES_WHAT_BYTES, - IPT_CONNBYTES_WHAT_AVGPKT, + IPT_CONNBYTES_PKTS, + IPT_CONNBYTES_BYTES, + IPT_CONNBYTES_AVGPKT, }; enum ipt_connbytes_direction { diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c index 47128c0..df4a42c 100644 --- a/net/ipv4/netfilter/ipt_connbytes.c +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -54,7 +54,7 @@ match(const struct sk_buff *skb, return 0; /* no match */ switch (sinfo->what) { - case IPT_CONNBYTES_WHAT_PKTS: + case IPT_CONNBYTES_PKTS: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: what = ct->counters[IP_CT_DIR_ORIGINAL].packets; @@ -68,7 +68,7 @@ match(const struct sk_buff *skb, break; } break; - case IPT_CONNBYTES_WHAT_BYTES: + case IPT_CONNBYTES_BYTES: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; @@ -82,7 +82,7 @@ match(const struct sk_buff *skb, break; } break; - case IPT_CONNBYTES_WHAT_AVGPKT: + case IPT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, @@ -128,9 +128,9 @@ static int check(const char *tablename, if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info))) return 0; - if (sinfo->what != IPT_CONNBYTES_WHAT_PKTS && - sinfo->what != IPT_CONNBYTES_WHAT_BYTES && - sinfo->what != IPT_CONNBYTES_WHAT_AVGPKT) + if (sinfo->what != IPT_CONNBYTES_PKTS && + sinfo->what != IPT_CONNBYTES_BYTES && + sinfo->what != IPT_CONNBYTES_AVGPKT) return 0; if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL && -- cgit v0.10.2 From a61bbcf28a8cb0ba56f8193d512f7222e711a294 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 17:24:31 -0700 Subject: [NET]: Store skb->timestamp as offset to a base timestamp Reduces skb size by 8 bytes on 64-bit. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c index 73c6b85..d74a7c5 100644 --- a/drivers/atm/ambassador.c +++ b/drivers/atm/ambassador.c @@ -513,7 +513,7 @@ static void rx_complete (amb_dev * dev, rx_out * rx) { // VC layer stats atomic_inc(&atm_vcc->stats->rx); - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); // end of our responsability atm_vcc->push (atm_vcc, skb); return; diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c index f2f01cb..57f1810 100644 --- a/drivers/atm/atmtcp.c +++ b/drivers/atm/atmtcp.c @@ -325,7 +325,7 @@ static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb) result = -ENOBUFS; goto done; } - do_gettimeofday(&new_skb->stamp); + __net_timestamp(new_skb); memcpy(skb_put(new_skb,skb->len),skb->data,skb->len); out_vcc->push(out_vcc,new_skb); atomic_inc(&vcc->stats->tx); diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 10da369..c13c4d7 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -537,7 +537,7 @@ static int rx_aal0(struct atm_vcc *vcc) return 0; } skb_put(skb,length); - skb->stamp = eni_vcc->timestamp; + skb_set_timestamp(skb, &eni_vcc->timestamp); DPRINTK("got len %ld\n",length); if (do_rx_dma(vcc,skb,1,length >> 2,length >> 2)) return 1; eni_vcc->rxing++; diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index b078fa5..5821974 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -815,7 +815,7 @@ static void process_incoming (struct fs_dev *dev, struct queue *q) skb_put (skb, qe->p1 & 0xffff); ATM_SKB(skb)->vcc = atm_vcc; atomic_inc(&atm_vcc->stats->rx); - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); fs_dprintk (FS_DEBUG_ALLOC, "Free rec-skb: %p (pushed)\n", skb); atm_vcc->push (atm_vcc, skb); fs_dprintk (FS_DEBUG_ALLOC, "Free rec-d: %p\n", pe); diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index 5f70219..2bf723a 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -1176,7 +1176,7 @@ fore200e_push_rpd(struct fore200e* fore200e, struct atm_vcc* vcc, struct rpd* rp return -ENOMEM; } - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); #ifdef FORE200E_52BYTE_AAL0_SDU if (cell_header) { diff --git a/drivers/atm/he.c b/drivers/atm/he.c index 28250c9..fde9334 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -1886,7 +1886,7 @@ he_service_rbrq(struct he_dev *he_dev, int group) if (rx_skb_reserve > 0) skb_reserve(skb, rx_skb_reserve); - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); for (iov = he_vcc->iov_head; iov < he_vcc->iov_tail; ++iov) { diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c index 924a2c8..0cded04 100644 --- a/drivers/atm/horizon.c +++ b/drivers/atm/horizon.c @@ -1034,7 +1034,7 @@ static void rx_schedule (hrz_dev * dev, int irq) { struct atm_vcc * vcc = ATM_SKB(skb)->vcc; // VC layer stats atomic_inc(&vcc->stats->rx); - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); // end of our responsability vcc->push (vcc, skb); } diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 30b7e99..b4a76ca 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -1101,7 +1101,7 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe) cell, ATM_CELL_PAYLOAD); ATM_SKB(sb)->vcc = vcc; - do_gettimeofday(&sb->stamp); + __net_timestamp(sb); vcc->push(vcc, sb); atomic_inc(&vcc->stats->rx); @@ -1179,7 +1179,7 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe) skb_trim(skb, len); ATM_SKB(skb)->vcc = vcc; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); vcc->push(vcc, skb); atomic_inc(&vcc->stats->rx); @@ -1201,7 +1201,7 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe) skb_trim(skb, len); ATM_SKB(skb)->vcc = vcc; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); vcc->push(vcc, skb); atomic_inc(&vcc->stats->rx); @@ -1340,7 +1340,7 @@ idt77252_rx_raw(struct idt77252_dev *card) ATM_CELL_PAYLOAD); ATM_SKB(sb)->vcc = vcc; - do_gettimeofday(&sb->stamp); + __net_timestamp(sb); vcc->push(vcc, sb); atomic_inc(&vcc->stats->rx); diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c index ffe3afa..51ec147 100644 --- a/drivers/atm/lanai.c +++ b/drivers/atm/lanai.c @@ -1427,7 +1427,7 @@ static void vcc_rx_aal5(struct lanai_vcc *lvcc, int endptr) skb_put(skb, size); vcc_rx_memcpy(skb->data, lvcc, size); ATM_SKB(skb)->vcc = lvcc->rx.atmvcc; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); lvcc->rx.atmvcc->push(lvcc->rx.atmvcc, skb); atomic_inc(&lvcc->rx.atmvcc->stats->rx); out: diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index a0e3bd8..c57e20dcb 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -2213,7 +2213,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) memcpy(sb->tail, cell, ATM_CELL_PAYLOAD); skb_put(sb, ATM_CELL_PAYLOAD); ATM_SKB(sb)->vcc = vcc; - do_gettimeofday(&sb->stamp); + __net_timestamp(sb); vcc->push(vcc, sb); atomic_inc(&vcc->stats->rx); cell += ATM_CELL_PAYLOAD; @@ -2346,7 +2346,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) skb->destructor = ns_sb_destructor; #endif /* NS_USE_DESTRUCTORS */ ATM_SKB(skb)->vcc = vcc; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); vcc->push(vcc, skb); atomic_inc(&vcc->stats->rx); } @@ -2373,7 +2373,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) sb->destructor = ns_sb_destructor; #endif /* NS_USE_DESTRUCTORS */ ATM_SKB(sb)->vcc = vcc; - do_gettimeofday(&sb->stamp); + __net_timestamp(sb); vcc->push(vcc, sb); atomic_inc(&vcc->stats->rx); } @@ -2398,7 +2398,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) memcpy(skb->data, sb->data, NS_SMBUFSIZE); skb_put(skb, len - NS_SMBUFSIZE); ATM_SKB(skb)->vcc = vcc; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); vcc->push(vcc, skb); atomic_inc(&vcc->stats->rx); } @@ -2505,7 +2505,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) #ifdef NS_USE_DESTRUCTORS hb->destructor = ns_hb_destructor; #endif /* NS_USE_DESTRUCTORS */ - do_gettimeofday(&hb->stamp); + __net_timestamp(hb); vcc->push(vcc, hb); atomic_inc(&vcc->stats->rx); } diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c index 85fee95..c4b75ec 100644 --- a/drivers/atm/zatm.c +++ b/drivers/atm/zatm.c @@ -400,7 +400,7 @@ unsigned long *x; EVENT("error code 0x%x/0x%x\n",(here[3] & uPD98401_AAL5_ES) >> uPD98401_AAL5_ES_SHIFT,error); skb = ((struct rx_buffer_head *) bus_to_virt(here[2]))->skb; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); #if 0 printk("[-3..0] 0x%08lx 0x%08lx 0x%08lx 0x%08lx\n",((unsigned *) skb->data)[-3], ((unsigned *) skb->data)[-2],((unsigned *) skb->data)[-1], diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 60b3215..32635c4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -155,13 +155,20 @@ struct skb_shared_info { #define SKB_DATAREF_SHIFT 16 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) +extern struct timeval skb_tv_base; + +struct skb_timeval { + u32 off_sec; + u32 off_usec; +}; + /** * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list * @list: List we are on * @sk: Socket we are owned by - * @stamp: Time we arrived + * @tstamp: Time we arrived stored as offset to skb_tv_base * @dev: Device we arrived on/are leaving by * @input_dev: Device we arrived on * @h: Transport layer header @@ -202,7 +209,7 @@ struct sk_buff { struct sk_buff *prev; struct sock *sk; - struct timeval stamp; + struct skb_timeval tstamp; struct net_device *dev; struct net_device *input_dev; @@ -1213,6 +1220,42 @@ static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, extern void skb_init(void); extern void skb_add_mtu(int mtu); +/** + * skb_get_timestamp - get timestamp from a skb + * @skb: skb to get stamp from + * @stamp: pointer to struct timeval to store stamp in + * + * Timestamps are stored in the skb as offsets to a base timestamp. + * This function converts the offset back to a struct timeval and stores + * it in stamp. + */ +static inline void skb_get_timestamp(struct sk_buff *skb, struct timeval *stamp) +{ + stamp->tv_sec = skb->tstamp.off_sec; + stamp->tv_usec = skb->tstamp.off_usec; + if (skb->tstamp.off_sec) { + stamp->tv_sec += skb_tv_base.tv_sec; + stamp->tv_usec += skb_tv_base.tv_usec; + } +} + +/** + * skb_set_timestamp - set timestamp of a skb + * @skb: skb to set stamp of + * @stamp: pointer to struct timeval to get stamp from + * + * Timestamps are stored in the skb as offsets to a base timestamp. + * This function converts a struct timeval to an offset and stores + * it in the skb. + */ +static inline void skb_set_timestamp(struct sk_buff *skb, struct timeval *stamp) +{ + skb->tstamp.off_sec = stamp->tv_sec - skb_tv_base.tv_sec; + skb->tstamp.off_usec = stamp->tv_usec - skb_tv_base.tv_usec; +} + +extern void __net_timestamp(struct sk_buff *skb); + #ifdef CONFIG_NETFILTER static inline void nf_conntrack_put(struct nf_conntrack *nfct) { diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6d63a47..7f933f3 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -404,7 +404,7 @@ static inline int hci_recv_frame(struct sk_buff *skb) bt_cb(skb)->incoming = 1; /* Time stamp */ - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); /* Queue frame for rx task */ skb_queue_tail(&hdev->rx_q, skb); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 8980989..34c0773 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -363,7 +363,14 @@ __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey, return neigh_create(tbl, pkey, dev); } -#define LOCALLY_ENQUEUED -2 +struct neighbour_cb { + unsigned long sched_next; + unsigned int flags; +}; + +#define LOCALLY_ENQUEUED 0x1 + +#define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) #endif #endif diff --git a/include/net/sock.h b/include/net/sock.h index 065df67..d594288 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1282,16 +1282,19 @@ static inline int sock_intr_errno(long timeo) static __inline__ void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - struct timeval *stamp = &skb->stamp; + struct timeval stamp; + + skb_get_timestamp(skb, &stamp); if (sock_flag(sk, SOCK_RCVTSTAMP)) { /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ - if (stamp->tv_sec == 0) - do_gettimeofday(stamp); + if (stamp.tv_sec == 0) + do_gettimeofday(&stamp); + skb_set_timestamp(skb, &stamp); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(struct timeval), - stamp); + &stamp); } else - sk->sk_stamp = *stamp; + sk->sk_stamp = stamp; } /** diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 4f9e11b..55dc42e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -999,7 +999,7 @@ static int hci_send_frame(struct sk_buff *skb) if (atomic_read(&hdev->promisc)) { /* Time stamp */ - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); hci_send_to_sock(hdev, skb); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 40b2195..d6da093 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1087,7 +1087,7 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) memcpy(ev->data, data, dlen); bt_cb(skb)->incoming = 1; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); bt_cb(skb)->pkt_type = HCI_EVENT_PKT; skb->dev = (void *) hdev; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index eed9090..32ef797 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -332,8 +332,12 @@ static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_ put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming); } - if (mask & HCI_CMSG_TSTAMP) - put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(skb->stamp), &skb->stamp); + if (mask & HCI_CMSG_TSTAMP) { + struct timeval tv; + + skb_get_timestamp(skb, &tv); + put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(tv), &tv); + } } static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 561d75c..acb888d 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -162,7 +162,7 @@ static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, pm->version = EBT_ULOG_VERSION; do_gettimeofday(&pm->stamp); if (ub->qlen == 1) - ub->skb->stamp = pm->stamp; + skb_set_timestamp(ub->skb, &pm->stamp); pm->data_len = copy_len; pm->mark = skb->nfmark; pm->hook = hooknr; diff --git a/net/core/dev.c b/net/core/dev.c index 9d153eb..a3ed53c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1009,13 +1009,22 @@ void net_disable_timestamp(void) atomic_dec(&netstamp_needed); } -static inline void net_timestamp(struct timeval *stamp) +void __net_timestamp(struct sk_buff *skb) +{ + struct timeval tv; + + do_gettimeofday(&tv); + skb_set_timestamp(skb, &tv); +} +EXPORT_SYMBOL(__net_timestamp); + +static inline void net_timestamp(struct sk_buff *skb) { if (atomic_read(&netstamp_needed)) - do_gettimeofday(stamp); + __net_timestamp(skb); else { - stamp->tv_sec = 0; - stamp->tv_usec = 0; + skb->tstamp.off_sec = 0; + skb->tstamp.off_usec = 0; } } @@ -1027,7 +1036,8 @@ static inline void net_timestamp(struct timeval *stamp) void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; - net_timestamp(&skb->stamp); + + net_timestamp(skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -1379,8 +1389,8 @@ int netif_rx(struct sk_buff *skb) if (netpoll_rx(skb)) return NET_RX_DROP; - if (!skb->stamp.tv_sec) - net_timestamp(&skb->stamp); + if (!skb->tstamp.off_sec) + net_timestamp(skb); /* * The code is rearranged so that the path is the most @@ -1566,8 +1576,8 @@ int netif_receive_skb(struct sk_buff *skb) if (skb->dev->poll && netpoll_rx(skb)) return NET_RX_DROP; - if (!skb->stamp.tv_sec) - net_timestamp(&skb->stamp); + if (!skb->tstamp.off_sec) + net_timestamp(skb); if (!skb->input_dev) skb->input_dev = skb->dev; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 1beb782..72ee00f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1217,7 +1217,7 @@ static void neigh_proxy_process(unsigned long arg) while (skb != (struct sk_buff *)&tbl->proxy_queue) { struct sk_buff *back = skb; - long tdif = back->stamp.tv_usec - now; + long tdif = NEIGH_CB(back)->sched_next - now; skb = skb->next; if (tdif <= 0) { @@ -1248,8 +1248,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, kfree_skb(skb); return; } - skb->stamp.tv_sec = LOCALLY_ENQUEUED; - skb->stamp.tv_usec = sched_next; + + NEIGH_CB(skb)->sched_next = sched_next; + NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); if (del_timer(&tbl->proxy_timer)) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ef498cb..39a161d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -70,6 +70,8 @@ static kmem_cache_t *skbuff_head_cache; +struct timeval __read_mostly skb_tv_base; + /* * Keep out-of-line to prevent kernel bloat. * __builtin_return_address is not used because it is not always @@ -331,7 +333,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) n->next = n->prev = NULL; n->sk = NULL; - C(stamp); + C(tstamp); C(dev); C(h); C(nh); @@ -408,7 +410,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) memcpy(new->cb, old->cb, sizeof(old->cb)); new->local_df = old->local_df; new->pkt_type = old->pkt_type; - new->stamp = old->stamp; + new->tstamp = old->tstamp; new->destructor = NULL; #ifdef CONFIG_NETFILTER new->nfmark = old->nfmark; @@ -1645,6 +1647,7 @@ void __init skb_init(void) NULL, NULL); if (!skbuff_head_cache) panic("cannot create skbuff cache"); + do_gettimeofday(&skb_tv_base); } EXPORT_SYMBOL(___pskb_trim); @@ -1678,3 +1681,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read); EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text); +EXPORT_SYMBOL(skb_tv_base); diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 8f06399..4a62093 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -159,7 +159,7 @@ static int econet_recvmsg(struct kiocb *iocb, struct socket *sock, err = memcpy_toiovec(msg->msg_iov, skb->data, copied); if (err) goto out_free; - sk->sk_stamp = skb->stamp; + skb_get_timestamp(skb, &sk->sk_stamp); if (msg->msg_name) memcpy(msg->msg_name, skb->cb, msg->msg_namelen); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 6eb9c54..8bf312b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -865,7 +865,7 @@ static int arp_process(struct sk_buff *skb) if (n) neigh_release(n); - if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || + if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || in_dev->arp_parms->proxy_delay == 0) { arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); @@ -948,6 +948,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) goto out_of_mem; + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); + return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); freeskb: diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1ac64c0..9e6e683 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (skb->dev) qp->iif = skb->dev->ifindex; skb->dev = NULL; - qp->stamp = skb->stamp; + skb_get_timestamp(skb, &qp->stamp); qp->meat += skb->len; atomic_add(skb->truesize, &ip_frag_mem); if (offset == 0) @@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) head->next = NULL; head->dev = dev; - head->stamp = qp->stamp; + skb_set_timestamp(head, &qp->stamp); iph = head->nh.iph; iph->frag_off = 0; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 1c49833..7f2bcc7 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -240,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) pmsg->packet_id = (unsigned long )entry; pmsg->data_len = data_len; - pmsg->timestamp_sec = entry->skb->stamp.tv_sec; - pmsg->timestamp_usec = entry->skb->stamp.tv_usec; + pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec; + pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec; pmsg->mark = entry->skb->nfmark; pmsg->hook = entry->info->hook; pmsg->hw_protocol = entry->skb->protocol; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index b86f06e..1d8ac45 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -220,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum, pm = NLMSG_DATA(nlh); /* We might not have a timestamp, get one */ - if (skb->stamp.tv_sec == 0) - do_gettimeofday((struct timeval *)&skb->stamp); + if (skb->tstamp.off_sec == 0) + __net_timestamp((struct sk_buff *)skb); /* copy hook, prefix, timestamp, payload, etc. */ pm->data_len = copy_len; - pm->timestamp_sec = skb->stamp.tv_sec; - pm->timestamp_usec = skb->stamp.tv_usec; + pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec; + pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec; pm->mark = skb->nfmark; pm->hook = hooknum; if (prefix != NULL) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fdd9547..ebb8654 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2097,9 +2097,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt seq_rtt = -1; } else if (seq_rtt < 0) seq_rtt = now - scb->when; - if (seq_usrtt) - *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 - + (usnow.tv_usec - skb->stamp.tv_usec); + if (seq_usrtt) { + struct timeval tv; + + skb_get_timestamp(skb, &tv); + *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 + + (usnow.tv_usec - tv.tv_usec); + } if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 267b0fc..8d92ab5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -282,7 +282,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) /* If congestion control is doing timestamping */ if (icsk->icsk_ca_ops->rtt_sample) - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); sysctl_flags = 0; if (tcb->flags & TCPCB_FLAG_SYN) { @@ -483,7 +483,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned * skbs, which it never sent before. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; - buff->stamp = skb->stamp; + buff->tstamp = skb->tstamp; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { tp->lost_out -= tcp_skb_pcount(skb); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7ae72d4..a7eae30 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -812,7 +812,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) if (ipv6_chk_acast_addr(dev, &msg->target) || (idev->cnf.forwarding && pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) { - if (skb->stamp.tv_sec != LOCALLY_ENQUEUED && + if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && inc != 0 && idev->nd_parms->proxy_delay != 0) { @@ -1487,6 +1487,8 @@ int ndisc_rcv(struct sk_buff *skb) return 0; } + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); + switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: ndisc_recv_ns(skb); diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 7ecb91e..4467645 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -238,8 +238,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) pmsg->packet_id = (unsigned long )entry; pmsg->data_len = data_len; - pmsg->timestamp_sec = entry->skb->stamp.tv_sec; - pmsg->timestamp_usec = entry->skb->stamp.tv_usec; + pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec; + pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec; pmsg->mark = entry->skb->nfmark; pmsg->hook = entry->info->hook; pmsg->hw_protocol = entry->skb->protocol; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 59e7c63..9d9e043 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -562,7 +562,7 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, if (skb->dev) fq->iif = skb->dev->ifindex; skb->dev = NULL; - fq->stamp = skb->stamp; + skb_get_timestamp(skb, &fq->stamp); fq->meat += skb->len; atomic_add(skb->truesize, &ip6_frag_mem); @@ -664,7 +664,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, head->next = NULL; head->dev = dev; - head->stamp = fq->stamp; + skb_set_timestamp(head, &fq->stamp); head->nh.ipv6h->payload_len = htons(payload_len); *skb_in = head; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 39d5939..c54f8ac 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1796,8 +1796,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, copied); if (rc) goto out_free; - if (skb->stamp.tv_sec) - sk->sk_stamp = skb->stamp; + if (skb->tstamp.off_sec) + skb_get_timestamp(skb, &sk->sk_stamp) msg->msg_namelen = sizeof(*sipx); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 464c9fa..ff5601c 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -491,11 +491,11 @@ __build_packet_message(struct nfulnl_instance *inst, NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw); } - if (skb->stamp.tv_sec) { + if (skb->tstamp.off_sec) { struct nfulnl_msg_packet_timestamp ts; - ts.sec = cpu_to_be64(skb->stamp.tv_sec); - ts.usec = cpu_to_be64(skb->stamp.tv_usec); + ts.sec = cpu_to_be64(skb_tv_base.tv_sec + skb->tstamp.off_sec); + ts.usec = cpu_to_be64(skb_tv_base.tv_usec + skb->tstamp.off_usec); NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts); } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 741686f..e3a5285 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -494,11 +494,11 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); } - if (entry->skb->stamp.tv_sec) { + if (entry->skb->tstamp.off_sec) { struct nfqnl_msg_packet_timestamp ts; - ts.sec = htonll(entry->skb->stamp.tv_sec); - ts.usec = htonll(entry->skb->stamp.tv_usec); + ts.sec = htonll(skb_tv_base.tv_sec + entry->skb->tstamp.off_sec); + ts.usec = htonll(skb_tv_base.tv_usec + entry->skb->tstamp.off_usec); NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index deb5f6f..ba99709 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -635,12 +635,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe h->tp_snaplen = snaplen; h->tp_mac = macoff; h->tp_net = netoff; - if (skb->stamp.tv_sec == 0) { - do_gettimeofday(&skb->stamp); + if (skb->tstamp.off_sec == 0) { + __net_timestamp(skb); sock_enable_timestamp(sk); } - h->tp_sec = skb->stamp.tv_sec; - h->tp_usec = skb->stamp.tv_usec; + h->tp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec; + h->tp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec; sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); sll->sll_halen = 0; diff --git a/net/sctp/input.c b/net/sctp/input.c index 742be91..28f3224 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -236,8 +236,8 @@ int sctp_rcv(struct sk_buff *skb) } /* SCTP seems to always need a timestamp right now (FIXME) */ - if (skb->stamp.tv_sec == 0) { - do_gettimeofday(&skb->stamp); + if (skb->tstamp.off_sec == 0) { + __net_timestamp(skb); sock_enable_timestamp(sk); } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 00d32b7..3868a8d 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1362,6 +1362,7 @@ struct sctp_association *sctp_unpack_cookie( char *key; sctp_scope_t scope; struct sk_buff *skb = chunk->skb; + struct timeval tv; headersize = sizeof(sctp_chunkhdr_t) + SCTP_SECRET_SIZE; bodysize = ntohs(chunk->chunk_hdr->length) - headersize; @@ -1434,7 +1435,8 @@ no_hmac: * an association, there is no need to check cookie's expiration * for init collision case of lost COOKIE ACK. */ - if (!asoc && tv_lt(bear_cookie->expiration, skb->stamp)) { + skb_get_timestamp(skb, &tv); + if (!asoc && tv_lt(bear_cookie->expiration, tv)) { __u16 len; /* * Section 3.3.10.3 Stale Cookie Error (3) @@ -1447,10 +1449,9 @@ no_hmac: len = ntohs(chunk->chunk_hdr->length); *errp = sctp_make_op_error_space(asoc, chunk, len); if (*errp) { - suseconds_t usecs = (skb->stamp.tv_sec - + suseconds_t usecs = (tv.tv_sec - bear_cookie->expiration.tv_sec) * 1000000L + - skb->stamp.tv_usec - - bear_cookie->expiration.tv_usec; + tv.tv_usec - bear_cookie->expiration.tv_usec; usecs = htonl(usecs); sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE, diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 199d374..05fe2e7 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -584,13 +584,16 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) /* possibly an icmp error */ dprintk("svc: recvfrom returned error %d\n", -err); } - if (skb->stamp.tv_sec == 0) { - skb->stamp.tv_sec = xtime.tv_sec; - skb->stamp.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; + if (skb->tstamp.off_sec == 0) { + struct timeval tv; + + tv.tv_sec = xtime.tv_sec; + tv.tv_usec = xtime.tv_nsec * 1000; + skb_set_timestamp(skb, &tv); /* Don't enable netstamp, sunrpc doesn't need that much accuracy */ } - svsk->sk_sk->sk_stamp = skb->stamp; + skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp); set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ /* -- cgit v0.10.2 From fb13ab2849074244a51ae5147483610529a29ced Mon Sep 17 00:00:00 2001 From: Domen Puncer Date: Sun, 14 Aug 2005 17:32:05 -0700 Subject: [NETFILTER]: Remove two unused files Signed-off-by: Domen Puncer Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv4/ip_logging.h b/include/linux/netfilter_ipv4/ip_logging.h deleted file mode 100644 index 0c5c52c..0000000 --- a/include/linux/netfilter_ipv4/ip_logging.h +++ /dev/null @@ -1,20 +0,0 @@ -/* IPv4 macros for the internal logging interface. */ -#ifndef __IP_LOGGING_H -#define __IP_LOGGING_H - -#ifdef __KERNEL__ -#include -#include - -#define nf_log_ip_packet(pskb,hooknum,in,out,fmt,args...) \ - nf_log_packet(AF_INET,pskb,hooknum,in,out,fmt,##args) - -#define nf_log_ip(pfh,len,fmt,args...) \ - nf_log(AF_INET,pfh,len,fmt,##args) - -#define nf_ip_log_register(logging) nf_log_register(AF_INET,logging) -#define nf_ip_log_unregister(logging) nf_log_unregister(AF_INET,logging) - -#endif /*__KERNEL__*/ - -#endif /*__IP_LOGGING_H*/ diff --git a/include/linux/netfilter_ipv6/ip6_logging.h b/include/linux/netfilter_ipv6/ip6_logging.h deleted file mode 100644 index a0b2ee3..0000000 --- a/include/linux/netfilter_ipv6/ip6_logging.h +++ /dev/null @@ -1,20 +0,0 @@ -/* IPv6 macros for the nternal logging interface. */ -#ifndef __IP6_LOGGING_H -#define __IP6_LOGGING_H - -#ifdef __KERNEL__ -#include -#include - -#define nf_log_ip6_packet(pskb,hooknum,in,out,fmt,args...) \ - nf_log_packet(AF_INET6,pskb,hooknum,in,out,fmt,##args) - -#define nf_log_ip6(pfh,len,fmt,args...) \ - nf_log(AF_INET6,pfh,len,fmt,##args) - -#define nf_ip6_log_register(logging) nf_log_register(AF_INET6,logging) -#define nf_ip6_log_unregister(logging) nf_log_unregister(AF_INET6,logging) - -#endif /*__KERNEL__*/ - -#endif /*__IP6_LOGGING_H*/ -- cgit v0.10.2 From 9baa5c67ff4ce57b6b9f68c90714a1bb876fccd7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 17:32:50 -0700 Subject: [NETFILTER]: Don't exclude local packets from MASQUERADING Increases consistency in source-address selection. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 91e7450..2f3e181 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -86,11 +86,6 @@ masquerade_target(struct sk_buff **pskb, IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); - /* FIXME: For the moment, don't do local packets, breaks - testsuite for 2.3.49 --RR */ - if ((*pskb)->sk) - return NF_ACCEPT; - ct = ip_conntrack_get(*pskb, &ctinfo); IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); -- cgit v0.10.2 From 000efe1d86620244b8e017429e57fab4170ab05a Mon Sep 17 00:00:00 2001 From: Gary Wayne Smith Date: Sun, 14 Aug 2005 17:33:24 -0700 Subject: [NETFILTER]: Make NETMAP target usable in OUTPUT Signed-off-by: Gary Wayne Smith Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index 06254b2..e6e7b60 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -46,7 +46,8 @@ check(const char *tablename, DEBUGP(MODULENAME":check: size %u.\n", targinfosize); return 0; } - if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) { + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) | + (1 << NF_IP_LOCAL_OUT))) { DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); return 0; } @@ -76,12 +77,13 @@ target(struct sk_buff **pskb, struct ip_nat_range newrange; IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_POST_ROUTING); + || hooknum == NF_IP_POST_ROUTING + || hooknum == NF_IP_LOCAL_OUT); ct = ip_conntrack_get(*pskb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); - if (hooknum == NF_IP_PRE_ROUTING) + if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) new_ip = (*pskb)->nh.iph->daddr & ~netmask; else new_ip = (*pskb)->nh.iph->saddr & ~netmask; -- cgit v0.10.2 From 34b4a4a624bafe089107966a6c56d2a1aca026d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 14 Aug 2005 17:33:59 -0700 Subject: [NETFILTER]: Remove tasklist_lock abuse in ipt{,6}owner Rip out cmd/sid/pid matching since its unfixable broken and stands in the way of locking changes to tasklist_lock. Signed-off-by: Christoph Hellwig Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index 3b9065e..c1889f8 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c @@ -21,106 +21,6 @@ MODULE_AUTHOR("Marc Boucher "); MODULE_DESCRIPTION("iptables owner match"); static int -match_comm(const struct sk_buff *skb, const char *comm) -{ - struct task_struct *g, *p; - struct files_struct *files; - int i; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if(strncmp(p->comm, comm, sizeof(p->comm))) - continue; - - task_lock(p); - files = p->files; - if(files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == - skb->sk->sk_socket->file) { - spin_unlock(&files->file_lock); - task_unlock(p); - read_unlock(&tasklist_lock); - return 1; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - return 0; -} - -static int -match_pid(const struct sk_buff *skb, pid_t pid) -{ - struct task_struct *p; - struct files_struct *files; - int i; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - if (!p) - goto out; - task_lock(p); - files = p->files; - if(files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == - skb->sk->sk_socket->file) { - spin_unlock(&files->file_lock); - task_unlock(p); - read_unlock(&tasklist_lock); - return 1; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); -out: - read_unlock(&tasklist_lock); - return 0; -} - -static int -match_sid(const struct sk_buff *skb, pid_t sid) -{ - struct task_struct *g, *p; - struct file *file = skb->sk->sk_socket->file; - int i, found=0; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - struct files_struct *files; - if (p->signal->session != sid) - continue; - - task_lock(p); - files = p->files; - if (files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == file) { - found = 1; - break; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); - if (found) - goto out; - } while_each_thread(g, p); -out: - read_unlock(&tasklist_lock); - - return found; -} - -static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -145,24 +45,6 @@ match(const struct sk_buff *skb, return 0; } - if(info->match & IPT_OWNER_PID) { - if (!match_pid(skb, info->pid) ^ - !!(info->invert & IPT_OWNER_PID)) - return 0; - } - - if(info->match & IPT_OWNER_SID) { - if (!match_sid(skb, info->sid) ^ - !!(info->invert & IPT_OWNER_SID)) - return 0; - } - - if(info->match & IPT_OWNER_COMM) { - if (!match_comm(skb, info->comm) ^ - !!(info->invert & IPT_OWNER_COMM)) - return 0; - } - return 1; } @@ -173,6 +55,8 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + const struct ipt_owner_info *info = matchinfo; + if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); @@ -184,15 +68,13 @@ checkentry(const char *tablename, IPT_ALIGN(sizeof(struct ipt_owner_info))); return 0; } -#ifdef CONFIG_SMP - /* files->file_lock can not be used in a BH */ - if (((struct ipt_owner_info *)matchinfo)->match - & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { - printk("ipt_owner: pid, sid and command matching is broken " - "on SMP.\n"); + + if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { + printk("ipt_owner: pid, sid and command matching " + "not supported anymore\n"); return 0; } -#endif + return 1; } diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c index ab0e32d..9b91dec 100644 --- a/net/ipv6/netfilter/ip6t_owner.c +++ b/net/ipv6/netfilter/ip6t_owner.c @@ -20,71 +20,6 @@ MODULE_AUTHOR("Marc Boucher "); MODULE_DESCRIPTION("IP6 tables owner matching module"); MODULE_LICENSE("GPL"); -static int -match_pid(const struct sk_buff *skb, pid_t pid) -{ - struct task_struct *p; - struct files_struct *files; - int i; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - if (!p) - goto out; - task_lock(p); - files = p->files; - if(files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == skb->sk->sk_socket->file) { - spin_unlock(&files->file_lock); - task_unlock(p); - read_unlock(&tasklist_lock); - return 1; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); -out: - read_unlock(&tasklist_lock); - return 0; -} - -static int -match_sid(const struct sk_buff *skb, pid_t sid) -{ - struct task_struct *g, *p; - struct file *file = skb->sk->sk_socket->file; - int i, found=0; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - struct files_struct *files; - if (p->signal->session != sid) - continue; - - task_lock(p); - files = p->files; - if (files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == file) { - found = 1; - break; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); - if (found) - goto out; - } while_each_thread(g, p); -out: - read_unlock(&tasklist_lock); - - return found; -} static int match(const struct sk_buff *skb, @@ -112,18 +47,6 @@ match(const struct sk_buff *skb, return 0; } - if(info->match & IP6T_OWNER_PID) { - if (!match_pid(skb, info->pid) ^ - !!(info->invert & IP6T_OWNER_PID)) - return 0; - } - - if(info->match & IP6T_OWNER_SID) { - if (!match_sid(skb, info->sid) ^ - !!(info->invert & IP6T_OWNER_SID)) - return 0; - } - return 1; } @@ -134,6 +57,8 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + const struct ip6t_owner_info *info = matchinfo; + if (hook_mask & ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) { printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); @@ -142,14 +67,13 @@ checkentry(const char *tablename, if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info))) return 0; -#ifdef CONFIG_SMP - /* files->file_lock can not be used in a BH */ - if (((struct ip6t_owner_info *)matchinfo)->match - & (IP6T_OWNER_PID|IP6T_OWNER_SID)) { - printk("ip6t_owner: pid and sid matching is broken on SMP.\n"); + + if (info->match & (IP6T_OWNER_PID|IP6T_OWNER_SID)) { + printk("ipt_owner: pid and sid matching " + "not supported anymore\n"); return 0; } -#endif + return 1; } -- cgit v0.10.2 From c173437669967301facff151bfeb7bae67354e4c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 13 Aug 2005 20:34:23 -0300 Subject: [PACKET_HISTORY]: Add dccphtx_rtt and rename the win_count fields As requested by Ian. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ian McDonald Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 80f12c9..edf9740 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -1004,7 +1004,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, /* Can we send? if so add options and add to packet history */ if (rc == 0) - new_packet->dccphtx_win_count = + new_packet->dccphtx_ccval = DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; out: @@ -1060,7 +1060,7 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) min_t(unsigned long, quarter_rtt, 5)) % 16; ccid3_pr_debug("%s, sk=%p, window changed from %u to %u!\n", dccp_role(sk), sk, - packet->dccphtx_win_count, + packet->dccphtx_ccval, hctx->ccid3hctx_last_win_count); } /* COMPLIANCE_END */ @@ -1068,9 +1068,10 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) ccid3_pr_debug("%s, sk=%p, packet sent (%llu,%u)\n", dccp_role(sk), sk, packet->dccphtx_seqno, - packet->dccphtx_win_count); + packet->dccphtx_ccval); #endif hctx->ccid3hctx_idle = 0; + packet->dccphtx_rtt = hctx->ccid3hctx_rtt; packet->dccphtx_sent = 1; } else ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n", @@ -1489,11 +1490,10 @@ trim_history: step = 2; /* OK, find next data packet */ num_later = 1; - win_count = entry->dccphrx_win_count; + win_count = entry->dccphrx_ccval; break; case 2: - tmp = (win_count - - entry->dccphrx_win_count); + tmp = win_count - entry->dccphrx_ccval; if (tmp < 0) tmp += TFRC_WIN_COUNT_LIMIT; if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) { @@ -1553,7 +1553,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) } do_gettimeofday(&(hcrx->ccid3hcrx_tstamp_last_feedback)); - hcrx->ccid3hcrx_last_counter = packet->dccphrx_win_count; + hcrx->ccid3hcrx_last_counter = packet->dccphrx_ccval; hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno; hcrx->ccid3hcrx_bytes_recv = 0; @@ -1645,11 +1645,11 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) switch (step) { case 0: tstamp = entry->dccphrx_tstamp; - win_count = entry->dccphrx_win_count; + win_count = entry->dccphrx_ccval; step = 1; break; case 1: - interval = win_count - entry->dccphrx_win_count; + interval = win_count - entry->dccphrx_ccval; if (interval < 0) interval += TFRC_WIN_COUNT_LIMIT; if (interval > 4) @@ -1816,7 +1816,7 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) } if (seq_loss != DCCP_MAX_SEQNO + 1) - win_loss = a_loss->dccphrx_win_count; + win_loss = a_loss->dccphrx_ccval; out_update_li: ccid3_hc_rx_update_li(sk, seq_loss, win_loss); @@ -1918,7 +1918,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) return; } - win_count = packet->dccphrx_win_count; + win_count = packet->dccphrx_ccval; ins = ccid3_hc_rx_add_hist(sk, packet); diff --git a/net/dccp/packet_history.h b/net/dccp/packet_history.h index 565dc96..0056525 100644 --- a/net/dccp/packet_history.h +++ b/net/dccp/packet_history.h @@ -47,15 +47,16 @@ struct dccp_tx_hist_entry { struct list_head dccphtx_node; u64 dccphtx_seqno:48, - dccphtx_win_count:8, + dccphtx_ccval:4, dccphtx_sent:1; + u32 dccphtx_rtt; struct timeval dccphtx_tstamp; }; struct dccp_rx_hist_entry { struct list_head dccphrx_node; u64 dccphrx_seqno:48, - dccphrx_win_count:4, + dccphrx_ccval:4, dccphrx_type:4; u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */ struct timeval dccphrx_tstamp; @@ -136,10 +137,10 @@ static inline struct dccp_rx_hist_entry * if (entry != NULL) { const struct dccp_hdr *dh = dccp_hdr(skb); - entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - entry->dccphrx_win_count = dh->dccph_ccval; - entry->dccphrx_type = dh->dccph_type; - entry->dccphrx_ndp = ndp; + entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + entry->dccphrx_ccval = dh->dccph_ccval; + entry->dccphrx_type = dh->dccph_type; + entry->dccphrx_ndp = ndp; do_gettimeofday(&(entry->dccphrx_tstamp)); } -- cgit v0.10.2 From 7690af3fff7633e40b1b9950eb8489129251d074 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 13 Aug 2005 20:34:54 -0300 Subject: [DCCP]: Just reflow the source code to fit in 80 columns Andrew Morton should be happy now 8) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 469f9a1..95eb47d 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -30,21 +30,26 @@ struct ccid { int (*ccid_hc_tx_init)(struct sock *sk); void (*ccid_hc_rx_exit)(struct sock *sk); void (*ccid_hc_tx_exit)(struct sock *sk); - void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); + void (*ccid_hc_rx_packet_recv)(struct sock *sk, + struct sk_buff *skb); int (*ccid_hc_rx_parse_options)(struct sock *sk, unsigned char option, unsigned char len, u16 idx, unsigned char* value); - void (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); - void (*ccid_hc_tx_insert_options)(struct sock *sk, struct sk_buff *skb); - void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); + void (*ccid_hc_rx_insert_options)(struct sock *sk, + struct sk_buff *skb); + void (*ccid_hc_tx_insert_options)(struct sock *sk, + struct sk_buff *skb); + void (*ccid_hc_tx_packet_recv)(struct sock *sk, + struct sk_buff *skb); int (*ccid_hc_tx_parse_options)(struct sock *sk, unsigned char option, unsigned char len, u16 idx, unsigned char* value); int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb, int len); - void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more, int len); + void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more, + int len); }; extern int ccid_register(struct ccid *ccid); @@ -123,7 +128,8 @@ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, { int rc = 0; if (ccid->ccid_hc_tx_parse_options != NULL) - rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx, value); + rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx, + value); return rc; } diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 8a0d7af..62e735f 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -25,7 +25,8 @@ extern int dccp_debug; do { if (dccp_debug) \ printk(KERN_DEBUG "%s: " format, __FUNCTION__ , ##a); \ } while (0) -#define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) printk(format, ##a); } while (0) +#define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) \ + printk(format, ##a); } while (0) #else #define dccp_pr_debug(format, a...) #define dccp_pr_debug_cat(format, a...) @@ -72,7 +73,8 @@ static inline const int after48(const u64 seq1, const u64 seq2) } /* is seq2 <= seq1 <= seq3 ? */ -static inline const int between48(const u64 seq1, const u64 seq2, const u64 seq3) +static inline const int between48(const u64 seq1, const u64 seq2, + const u64 seq3) { return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); } @@ -107,12 +109,14 @@ struct dccp_mib { } __SNMP_MIB_ALIGN__; DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); -#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) -#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) -#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field) -#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) -#define DCCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(dccp_statistics, field, val) -#define DCCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(dccp_statistics, field, val) +#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) +#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) +#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field) +#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) +#define DCCP_ADD_STATS_BH(field, val) \ + SNMP_ADD_STATS_BH(dccp_statistics, field, val) +#define DCCP_ADD_STATS_USER(field, val) \ + SNMP_ADD_STATS_USER(dccp_statistics, field, val) extern int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb); extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb); @@ -234,8 +238,8 @@ extern int dccp_disconnect(struct sock *sk, int flags); extern int dccp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen); extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); -extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t size); +extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size); extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); @@ -246,7 +250,8 @@ extern void dccp_shutdown(struct sock *sk, int how); extern int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, const u32 daddr); -extern int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code); +extern int dccp_v4_send_reset(struct sock *sk, + enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk); struct dccp_skb_cb { @@ -303,7 +308,8 @@ static inline void dccp_inc_seqno(u64 *seqno) static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) { - struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + sizeof(*dh)); + struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + + sizeof(*dh)); #if defined(__LITTLE_ENDIAN_BITFIELD) dh->dccph_seq = htonl((gss >> 32)) >> 8; @@ -315,7 +321,8 @@ static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) dhx->dccph_seq_low = htonl(gss & 0xffffffff); } -static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, const u64 gsr) +static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, + const u64 gsr) { #if defined(__LITTLE_ENDIAN_BITFIELD) dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8; @@ -332,11 +339,14 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq) struct dccp_sock *dp = dccp_sk(sk); u64 tmp_gsr; - dccp_set_seqno(&tmp_gsr, dp->dccps_gsr + 1 - (dp->dccps_options.dccpo_sequence_window / 4)); + dccp_set_seqno(&tmp_gsr, + (dp->dccps_gsr + 1 - + (dp->dccps_options.dccpo_sequence_window / 4))); dp->dccps_gsr = seq; dccp_set_seqno(&dp->dccps_swl, max48(tmp_gsr, dp->dccps_isr)); dccp_set_seqno(&dp->dccps_swh, - dp->dccps_gsr + (3 * dp->dccps_options.dccpo_sequence_window) / 4); + (dp->dccps_gsr + + (3 * dp->dccps_options.dccpo_sequence_window) / 4)); } static inline void dccp_update_gss(struct sock *sk, u64 seq) @@ -344,7 +354,9 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq) struct dccp_sock *dp = dccp_sk(sk); u64 tmp_gss; - dccp_set_seqno(&tmp_gss, dp->dccps_gss - dp->dccps_options.dccpo_sequence_window + 1); + dccp_set_seqno(&tmp_gss, + (dp->dccps_gss - + dp->dccps_options.dccpo_sequence_window + 1)); dp->dccps_awl = max48(tmp_gss, dp->dccps_iss); dp->dccps_awh = dp->dccps_gss = seq; } @@ -373,16 +385,20 @@ extern struct socket *dccp_ctl_socket; * * @dccpap_buf_head - circular buffer head * @dccpap_buf_tail - circular buffer tail - * @dccpap_buf_ackno - ack # of the most recent packet acknoldgeable in the buffer (i.e. %dccpap_buf_head) - * @dccpap_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked by the buffer with State 0 + * @dccpap_buf_ackno - ack # of the most recent packet acknowledgeable in the + * buffer (i.e. %dccpap_buf_head) + * @dccpap_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked + * by the buffer with State 0 * * Additionally, the HC-Receiver must keep some information about the * Ack Vectors it has recently sent. For each packet sent carrying an * Ack Vector, it remembers four variables: * - * @dccpap_ack_seqno - the Sequence Number used for the packet (HC-Receiver seqno) + * @dccpap_ack_seqno - the Sequence Number used for the packet + * (HC-Receiver seqno) * @dccpap_ack_ptr - the value of buf_head at the time of acknowledgement. - * @dccpap_ack_ackno - the Acknowledgement Number used for the packet (HC-Sender seqno) + * @dccpap_ack_ackno - the Acknowledgement Number used for the packet + * (HC-Sender seqno) * @dccpap_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. * * @dccpap_buf_len - circular buffer length diff --git a/net/dccp/input.c b/net/dccp/input.c index bdaecde..4b8638f 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -93,7 +93,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) */ if (dh->dccph_type == DCCP_PKT_SYNC || dh->dccph_type == DCCP_PKT_SYNCACK) { - if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awl, dp->dccps_awh) && + if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, + dp->dccps_awl, dp->dccps_awh) && !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl)) dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); else @@ -122,11 +123,13 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) && (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ || - between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, lawl, dp->dccps_awh))) { + between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, + lawl, dp->dccps_awh))) { dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); if (dh->dccph_type != DCCP_PKT_SYNC && - DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + (DCCP_SKB_CB(skb)->dccpd_ack_seq != + DCCP_PKT_WITHOUT_ACK_SEQ)) dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq; } else { dccp_pr_debug("Step 6 failed, sending SYNC...\n"); @@ -161,10 +164,13 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKPKTS_STATE_RECEIVED)) { - LIMIT_NETDEBUG(KERN_INFO "DCCP: acknowledgeable packets buffer full!\n"); + LIMIT_NETDEBUG(KERN_INFO "DCCP: acknowledgeable " + "packets buffer full!\n"); ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MIN, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MIN, + DCCP_RTO_MAX); goto discard; } @@ -175,7 +181,8 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, */ if (!inet_csk_ack_scheduled(sk)) { inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5 * HZ, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5 * HZ, + DCCP_RTO_MAX); } } @@ -186,8 +193,8 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, case DCCP_PKT_DATAACK: case DCCP_PKT_DATA: /* - * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED option - * if it is. + * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED + * option if it is. */ __skb_pull(skb, dh->dccph_doff * 4); __skb_queue_tail(&sk->sk_receive_queue, skb); @@ -272,11 +279,13 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, __kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; - if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awl, dp->dccps_awh)) { - dccp_pr_debug("invalid ackno: S.AWL=%llu, P.ackno=%llu, S.AWH=%llu \n", - (unsigned long long) dp->dccps_awl, - (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long) dp->dccps_awh); + if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, + dp->dccps_awl, dp->dccps_awh)) { + dccp_pr_debug("invalid ackno: S.AWL=%llu, " + "P.ackno=%llu, S.AWH=%llu \n", + (unsigned long long)dp->dccps_awl, + (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, + (unsigned long long)dp->dccps_awh); goto out_invalid_packet; } @@ -296,16 +305,17 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, /* * Step 10: Process REQUEST state (second part) * If S.state == REQUEST, - * / * If we get here, P is a valid Response from the server (see - * Step 4), and we should move to PARTOPEN state. PARTOPEN - * means send an Ack, don't send Data packets, retransmit - * Acks periodically, and always include any Init Cookie from - * the Response * / + * / * If we get here, P is a valid Response from the + * server (see Step 4), and we should move to + * PARTOPEN state. PARTOPEN means send an Ack, + * don't send Data packets, retransmit Acks + * periodically, and always include any Init Cookie + * from the Response * / * S.state := PARTOPEN * Set PARTOPEN timer * Continue with S.state == PARTOPEN - * / * Step 12 will send the Ack completing the three-way - * handshake * / + * / * Step 12 will send the Ack completing the + * three-way handshake * / */ dccp_set_state(sk, DCCP_PARTOPEN); @@ -341,7 +351,8 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, out_invalid_packet: return 1; /* dccp_v4_do_rcv will send a reset, but... - FIXME: the reset code should be DCCP_RESET_CODE_PACKET_ERROR */ + FIXME: the reset code should be + DCCP_RESET_CODE_PACKET_ERROR */ } static int dccp_rcv_respond_partopen_state_process(struct sock *sk, @@ -358,11 +369,12 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk, case DCCP_PKT_DATAACK: case DCCP_PKT_ACK: /* - * FIXME: we should be reseting the PARTOPEN (DELACK) timer here, - * but only if we haven't used the DELACK timer for something else, - * like sending a delayed ack for a TIMESTAMP echo, etc, for now - * were not clearing it, sending an extra ACK when there is nothing - * else to do in DELACK is not a big deal after all. + * FIXME: we should be reseting the PARTOPEN (DELACK) timer + * here but only if we haven't used the DELACK timer for + * something else, like sending a delayed ack for a TIMESTAMP + * echo, etc, for now were not clearing it, sending an extra + * ACK when there is nothing else to do in DELACK is not a big + * deal after all. */ /* Stop the PARTOPEN timer */ @@ -374,7 +386,8 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk, if (dh->dccph_type == DCCP_PKT_DATAACK) { dccp_rcv_established(sk, skb, dh, len); - queued = 1; /* packet was queued (by dccp_rcv_established) */ + queued = 1; /* packet was queued + (by dccp_rcv_established) */ } break; } @@ -399,7 +412,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dccp_parse_options(sk, skb)) goto discard; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != + DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); @@ -415,14 +429,17 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, DCCP_ACKPKTS_STATE_RECEIVED)) goto discard; /* - * FIXME: this activation is probably wrong, have to study more - * TCP delack machinery and how it fits into DCCP draft, but - * for now it kinda "works" 8) + * FIXME: this activation is probably wrong, have to + * study more TCP delack machinery and how it fits into + * DCCP draft, but for now it kinda "works" 8) */ - if (dp->dccps_hc_rx_ackpkts->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1 && + if ((dp->dccps_hc_rx_ackpkts->dccpap_ack_seqno == + DCCP_MAX_SEQNO + 1) && !inet_csk_ack_scheduled(sk)) { inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MIN, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MIN, + DCCP_RTO_MAX); } } } @@ -436,7 +453,10 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Drop packet and return */ if (dh->dccph_type == DCCP_PKT_RESET) { - /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ + /* + * Queue the equivalent of TCP fin so that dccp_recvmsg + * exits the loop + */ dccp_fin(sk, skb); dccp_time_wait(sk, DCCP_TIME_WAIT, 0); return 0; @@ -450,10 +470,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Drop packet and return */ } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && - (dh->dccph_type == DCCP_PKT_RESPONSE || dh->dccph_type == DCCP_PKT_CLOSEREQ)) || + (dh->dccph_type == DCCP_PKT_RESPONSE || + dh->dccph_type == DCCP_PKT_CLOSEREQ)) || (dp->dccps_role == DCCP_ROLE_CLIENT && dh->dccph_type == DCCP_PKT_REQUEST) || - (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { + (sk->sk_state == DCCP_RESPOND && + dh->dccph_type == DCCP_PKT_DATA)) { dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); goto discard; } @@ -491,11 +513,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case DCCP_RESPOND: case DCCP_PARTOPEN: - queued = dccp_rcv_respond_partopen_state_process(sk, skb, dh, len); + queued = dccp_rcv_respond_partopen_state_process(sk, skb, + dh, len); break; } - if (dh->dccph_type == DCCP_PKT_ACK || dh->dccph_type == DCCP_PKT_DATAACK) { + if (dh->dccph_type == DCCP_PKT_ACK || + dh->dccph_type == DCCP_PKT_DATAACK) { switch (old_state) { case DCCP_PARTOPEN: sk->sk_state_change(sk); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d3770ae..42d9c87 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -29,7 +29,7 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, .lhash_users = ATOMIC_INIT(0), - .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), + .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), .portalloc_lock = SPIN_LOCK_UNLOCKED, .port_rover = 1024 - 1, }; @@ -61,7 +61,8 @@ static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, const int dif = sk->sk_bound_dev_if; INET_ADDR_COOKIE(acookie, saddr, daddr) const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, dccp_hashinfo.ehash_size); + const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, + dccp_hashinfo.ehash_size); struct inet_ehash_bucket *head = &dccp_hashinfo.ehash[hash]; const struct sock *sk2; const struct hlist_node *node; @@ -133,11 +134,12 @@ static int dccp_v4_hash_connect(struct sock *sk) local_bh_disable(); /* TODO. Actually it is not so bad idea to remove - * dccp_hashinfo.portalloc_lock before next submission to Linus. + * dccp_hashinfo.portalloc_lock before next submission to + * Linus. * As soon as we touch this place at all it is time to think. * - * Now it protects single _advisory_ variable dccp_hashinfo.port_rover, - * hence it is mostly useless. + * Now it protects single _advisory_ variable + * dccp_hashinfo.port_rover, hence it is mostly useless. * Code will work nicely if we just delete it, but * I am afraid in contented case it will work not better or * even worse: another cpu just will hit the same bucket @@ -152,7 +154,8 @@ static int dccp_v4_hash_connect(struct sock *sk) rover++; if ((rover < low) || (rover > high)) rover = low; - head = &dccp_hashinfo.bhash[inet_bhashfn(rover, dccp_hashinfo.bhash_size)]; + head = &dccp_hashinfo.bhash[inet_bhashfn(rover, + dccp_hashinfo.bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -172,7 +175,8 @@ static int dccp_v4_hash_connect(struct sock *sk) } } - tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, head, rover); + tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, + head, rover); if (tb == NULL) { spin_unlock(&head->lock); break; @@ -211,7 +215,8 @@ ok: goto out; } - head = &dccp_hashinfo.bhash[inet_bhashfn(snum, dccp_hashinfo.bhash_size)]; + head = &dccp_hashinfo.bhash[inet_bhashfn(snum, + dccp_hashinfo.bhash_size)]; tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { @@ -313,7 +318,9 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, out: return err; failure: - /* This unhashes the socket and releases the local port, if necessary. */ + /* + * This unhashes the socket and releases the local port, if necessary. + */ dccp_set_state(sk, DCCP_CLOSED); ip_rt_put(rt); sk->sk_route_caps = 0; @@ -365,8 +372,9 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, /* * From: draft-ietf-dccp-spec-11.txt * - * DCCP-Sync packets are the best choice for upward probing, - * since DCCP-Sync probes do not risk application data loss. + * DCCP-Sync packets are the best choice for upward + * probing, since DCCP-Sync probes do not risk application + * data loss. */ dccp_send_sync(sk, dp->dccps_gsr); } /* else let the usual retransmit timer handle it */ @@ -405,11 +413,13 @@ static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb) dh->dccph_x = 1; dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); bh_lock_sock(dccp_ctl_socket->sk); err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk, - rxskb->nh.iph->daddr, rxskb->nh.iph->saddr, NULL); + rxskb->nh.iph->daddr, + rxskb->nh.iph->saddr, NULL); bh_unlock_sock(dccp_ctl_socket->sk); if (err == NET_XMIT_CN || err == 0) { @@ -418,7 +428,8 @@ static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb) } } -static void dccp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) +static void dccp_v4_reqsk_send_ack(struct sk_buff *skb, + struct request_sock *req) { dccp_v4_ctl_send_ack(skb); } @@ -465,7 +476,8 @@ out: void dccp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (struct iphdr *)skb->data; - const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + (iph->ihl << 2)); + const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + + (iph->ihl << 2)); struct dccp_sock *dp; struct inet_sock *inet; const int type = skb->h.icmph->type; @@ -605,7 +617,8 @@ out: sock_put(sk); } -extern struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, enum dccp_reset_codes code); +extern struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, + enum dccp_reset_codes code); int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) { @@ -689,7 +702,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq->loc_addr = daddr; ireq->rmt_addr = saddr; /* FIXME: Merge Aristeu's option parsing code when ready */ - req->rcv_wnd = 100; /* Fake, option parsing will get the right value */ + req->rcv_wnd = 100; /* Fake, option parsing will get the + right value */ ireq->opt = NULL; /* @@ -804,7 +818,8 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) return sk; } -int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, const u32 daddr) +int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, + const u32 daddr) { const struct dccp_hdr* dh = dccp_hdr(skb); int checksum_len; @@ -814,11 +829,13 @@ int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, const u32 daddr checksum_len = skb->len; else { checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32); - checksum_len = checksum_len < skb->len ? checksum_len : skb->len; + checksum_len = checksum_len < skb->len ? checksum_len : + skb->len; } tmp = csum_partial((unsigned char *)dh, checksum_len, 0); - return csum_tcpudp_magic(saddr, daddr, checksum_len, IPPROTO_DCCP, tmp); + return csum_tcpudp_magic(saddr, daddr, checksum_len, + IPPROTO_DCCP, tmp); } static int dccp_v4_verify_checksum(struct sk_buff *skb, @@ -832,10 +849,12 @@ static int dccp_v4_verify_checksum(struct sk_buff *skb, checksum_len = skb->len; else { checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32); - checksum_len = checksum_len < skb->len ? checksum_len : skb->len; + checksum_len = checksum_len < skb->len ? checksum_len : + skb->len; } tmp = csum_partial((unsigned char *)dh, checksum_len, 0); - return csum_tcpudp_magic(saddr, daddr, checksum_len, IPPROTO_DCCP, tmp) == 0 ? 0 : -1; + return csum_tcpudp_magic(saddr, daddr, checksum_len, + IPPROTO_DCCP, tmp) == 0 ? 0 : -1; } static struct dst_entry* dccp_v4_route_skb(struct sock *sk, @@ -850,7 +869,9 @@ static struct dst_entry* dccp_v4_route_skb(struct sock *sk, .proto = sk->sk_protocol, .uli_u = { .ports = { .sport = dccp_hdr(skb)->dccph_dport, - .dport = dccp_hdr(skb)->dccph_sport } } }; + .dport = dccp_hdr(skb)->dccph_sport } + } + }; if (ip_route_output_flow(&rt, &fl, sk, 0)) { IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); @@ -899,17 +920,20 @@ void dccp_v4_ctl_send_reset(struct sk_buff *rxskb) dh->dccph_dport = rxdh->dccph_sport; dh->dccph_doff = dccp_hdr_reset_len / 4; dh->dccph_x = 1; - dccp_hdr_reset(skb)->dccph_reset_code = DCCP_SKB_CB(rxskb)->dccpd_reset_code; + dccp_hdr_reset(skb)->dccph_reset_code = + DCCP_SKB_CB(rxskb)->dccpd_reset_code; dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); dh->dccph_checksum = dccp_v4_checksum(skb, rxskb->nh.iph->saddr, rxskb->nh.iph->daddr); bh_lock_sock(dccp_ctl_socket->sk); err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk, - rxskb->nh.iph->daddr, rxskb->nh.iph->saddr, NULL); + rxskb->nh.iph->daddr, + rxskb->nh.iph->saddr, NULL); bh_unlock_sock(dccp_ctl_socket->sk); if (err == NET_XMIT_CN || err == 0) { @@ -933,7 +957,8 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) /* * Step 3: Process LISTEN state * If S.state == LISTEN, - * If P.type == Request or P contains a valid Init Cookie option, + * If P.type == Request or P contains a valid Init Cookie + * option, * * Must scan the packet's options to check for an Init * Cookie. Only the Init Cookie is processed here, * however; other options are processed in Step 8. This @@ -950,7 +975,8 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * Generate Reset(No Connection) unless P.type == Reset * Drop packet and return * - * NOTE: the check for the packet types is done in dccp_rcv_state_process + * NOTE: the check for the packet types is done in + * dccp_rcv_state_process */ if (sk->sk_state == DCCP_LISTEN) { struct sock *nsk = dccp_v4_hnd_req(sk, skb); @@ -1007,7 +1033,8 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) } if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) { - dccp_pr_debug("P.Data Offset(%u) too small 2\n", dh->dccph_doff); + dccp_pr_debug("P.Data Offset(%u) too small 2\n", + dh->dccph_doff); return 1; } @@ -1021,8 +1048,8 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) dh->dccph_type != DCCP_PKT_DATA && dh->dccph_type != DCCP_PKT_ACK && dh->dccph_type != DCCP_PKT_DATAACK) { - dccp_pr_debug("P.type (%s) not Data, Ack nor DataAck and P.X == 0\n", - dccp_packet_name(dh->dccph_type)); + dccp_pr_debug("P.type (%s) not Data, Ack nor DataAck and " + "P.X == 0\n", dccp_packet_name(dh->dccph_type)); return 1; } @@ -1055,10 +1082,11 @@ int dccp_v4_rcv(struct sk_buff *skb) * dccp_ackpkts_add, you'll get something like this on a session that * sends 10 DATA/DATAACK packets: * - * dccp_ackpkts_print: 281473596467422 |0,0|3,0|0,0|3,0|0,0|3,0|0,0|3,0|0,1| + * ackpkts_print: 281473596467422 |0,0|3,0|0,0|3,0|0,0|3,0|0,0|3,0|0,1| * * 0, 0 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == just this packet - * 0, 1 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == two adjacent packets with the same state + * 0, 1 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == two adjacent packets + * with the same state * 3, 0 means: DCCP_ACKPKTS_STATE_NOT_RECEIVED, RLE == just this packet * * So... @@ -1072,10 +1100,12 @@ int dccp_v4_rcv(struct sk_buff *skb) * 281473596467416 was received * 281473596467415 was not received * 281473596467414 was received - * 281473596467413 was received (this one was the 3way handshake RESPONSE) + * 281473596467413 was received (this one was the 3way handshake + * RESPONSE) * */ - if (dh->dccph_type == DCCP_PKT_DATA || dh->dccph_type == DCCP_PKT_DATAACK) { + if (dh->dccph_type == DCCP_PKT_DATA || + dh->dccph_type == DCCP_PKT_DATAACK) { static int discard = 0; if (discard) { @@ -1170,7 +1200,8 @@ no_dccp_socket: * Drop packet and return */ if (dh->dccph_type != DCCP_PKT_RESET) { - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + DCCP_SKB_CB(skb)->dccpd_reset_code = + DCCP_RESET_CODE_NO_CONNECTION; dccp_v4_ctl_send_reset(skb); } @@ -1196,8 +1227,9 @@ static int dccp_v4_init_sock(struct sock *sk) dccp_options_init(&dp->dccps_options); if (dp->dccps_options.dccpo_send_ack_vector) { - dp->dccps_hc_rx_ackpkts = dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, - GFP_KERNEL); + dp->dccps_hc_rx_ackpkts = + dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, + GFP_KERNEL); if (dp->dccps_hc_rx_ackpkts == NULL) return -ENOMEM; @@ -1211,8 +1243,10 @@ static int dccp_v4_init_sock(struct sock *sk) * setsockopt(CCIDs-I-want/accept). -acme */ if (likely(!dccp_ctl_socket_init)) { - dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_ccid, sk); - dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_ccid, sk); + dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_ccid, + sk); + dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_ccid, + sk); if (dp->dccps_hc_rx_ccid == NULL || dp->dccps_hc_tx_ccid == NULL) { ccid_exit(dp->dccps_hc_rx_ccid, sk); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index a6a0b27..b8e6720 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -69,8 +69,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) * socket up. We've got bigger problems than * non-graceful socket closings. */ - if (net_ratelimit()) - printk(KERN_INFO "DCCP: time wait bucket table overflow\n"); + LIMIT_NETDEBUG(KERN_INFO "DCCP: time wait bucket " + "table overflow\n"); } dccp_done(sk); @@ -98,19 +98,23 @@ struct sock *dccp_create_openreq_child(struct sock *sk, newicsk->icsk_rto = DCCP_TIMEOUT_INIT; if (newdp->dccps_options.dccpo_send_ack_vector) { - newdp->dccps_hc_rx_ackpkts = dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, - GFP_ATOMIC); + newdp->dccps_hc_rx_ackpkts = + dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, + GFP_ATOMIC); /* - * XXX: We're using the same CCIDs set on the parent, i.e. sk_clone - * copied the master sock and left the CCID pointers for this child, - * that is why we do the __ccid_get calls. + * XXX: We're using the same CCIDs set on the parent, + * i.e. sk_clone copied the master sock and left the + * CCID pointers for this child, that is why we do the + * __ccid_get calls. */ if (unlikely(newdp->dccps_hc_rx_ackpkts == NULL)) goto out_free; } - if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid, newsk) != 0 || - ccid_hc_tx_init(newdp->dccps_hc_tx_ccid, newsk) != 0)) { + if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid, + newsk) != 0 || + ccid_hc_tx_init(newdp->dccps_hc_tx_ccid, + newsk) != 0)) { dccp_ackpkts_free(newdp->dccps_hc_rx_ackpkts); ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk); ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk); @@ -129,7 +133,8 @@ out_free: * Step 3: Process LISTEN state * * Choose S.ISS (initial seqno) or set from Init Cookie - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init + * Cookie */ /* See dccp_v4_conn_request */ @@ -160,13 +165,15 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, /* Check for retransmitted REQUEST */ if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { - if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dccp_rsk(req)->dreq_isr)) { + if (after48(DCCP_SKB_CB(skb)->dccpd_seq, + dccp_rsk(req)->dreq_isr)) { struct dccp_request_sock *dreq = dccp_rsk(req); dccp_pr_debug("Retransmitted REQUEST\n"); /* Send another RESPONSE packet */ dccp_set_seqno(&dreq->dreq_iss, dreq->dreq_iss + 1); - dccp_set_seqno(&dreq->dreq_isr, DCCP_SKB_CB(skb)->dccpd_seq); + dccp_set_seqno(&dreq->dreq_isr, + DCCP_SKB_CB(skb)->dccpd_seq); req->rsk_ops->rtx_syn_ack(sk, req, NULL); } /* Network Duplicate, discard packet */ @@ -181,7 +188,8 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, /* Invalid ACK */ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) { - dccp_pr_debug("Invalid ACK number: ack_seq=%llu, dreq_iss=%llu\n", + dccp_pr_debug("Invalid ACK number: ack_seq=%llu, " + "dreq_iss=%llu\n", (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq, (unsigned long long) @@ -223,7 +231,8 @@ int dccp_child_process(struct sock *parent, struct sock *child, const int state = child->sk_state; if (!sock_owned_by_user(child)) { - ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb), skb->len); + ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb), + skb->len); /* Wakeup parent, send SIGIO */ if (state == DCCP_RESPOND && child->sk_state != state) diff --git a/net/dccp/options.c b/net/dccp/options.c index 5bf9976..68d6614 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -59,14 +59,15 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); #ifdef DCCP_DEBUG - const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx opt: " : - "server rx opt: "; + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT rx opt: " : "server rx opt: "; #endif const struct dccp_hdr *dh = dccp_hdr(skb); const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); unsigned char *opt_ptr = options; - const unsigned char *opt_end = (unsigned char *)dh + (dh->dccph_doff * 4); + const unsigned char *opt_end = (unsigned char *)dh + + (dh->dccph_doff * 4); struct dccp_options_received *opt_recv = &dp->dccps_options_received; unsigned char opt, len; unsigned char *value; @@ -106,7 +107,8 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) goto out_invalid_option; opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); - dccp_pr_debug("%sNDP count=%d\n", debug_prefix, opt_recv->dccpor_ndp); + dccp_pr_debug("%sNDP count=%d\n", debug_prefix, + opt_recv->dccpor_ndp); break; case DCCPO_ACK_VECTOR_0: if (len > DCCP_MAX_ACK_VECTOR_LEN) @@ -124,8 +126,9 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq); dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); - dccp_ackpkts_check_rcv_ackvector(dp->dccps_hc_rx_ackpkts, sk, - DCCP_SKB_CB(skb)->dccpd_ack_seq, + dccp_ackpkts_check_rcv_ackvector(dp->dccps_hc_rx_ackpkts, + sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq, len, value); break; case DCCPO_TIMESTAMP: @@ -148,15 +151,21 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value); - dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, diff=%u\n", - debug_prefix, opt_recv->dccpor_timestamp_echo, + dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, " + "diff=%u\n", + debug_prefix, + opt_recv->dccpor_timestamp_echo, len + 2, (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq, - tcp_time_stamp - opt_recv->dccpor_timestamp_echo); - - opt_recv->dccpor_elapsed_time = dccp_decode_value_var(value + 4, len - 4); - dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n", debug_prefix, + (tcp_time_stamp - + opt_recv->dccpor_timestamp_echo)); + + opt_recv->dccpor_elapsed_time = + dccp_decode_value_var(value + 4, + len - 4); + dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n", + debug_prefix, opt_recv->dccpor_elapsed_time); break; case DCCPO_ELAPSED_TIME: @@ -165,33 +174,41 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) if (pkt_type == DCCP_PKT_DATA) continue; - opt_recv->dccpor_elapsed_time = dccp_decode_value_var(value, len); + opt_recv->dccpor_elapsed_time = + dccp_decode_value_var(value, len); dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix, opt_recv->dccpor_elapsed_time); break; /* * From draft-ietf-dccp-spec-11.txt: * - * Option numbers 128 through 191 are for options sent from the HC- - * Sender to the HC-Receiver; option numbers 192 through 255 are for - * options sent from the HC-Receiver to the HC-Sender. + * Option numbers 128 through 191 are for + * options sent from the HC-Sender to the + * HC-Receiver; option numbers 192 through 255 + * are for options sent from the HC-Receiver to + * the HC-Sender. */ case 128 ... 191: { const u16 idx = value - options; - if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, opt, len, idx, value) != 0) + if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, + opt, len, idx, + value) != 0) goto out_invalid_option; } break; case 192 ... 255: { const u16 idx = value - options; - if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, opt, len, idx, value) != 0) + if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, + opt, len, idx, + value) != 0) goto out_invalid_option; } break; default: - pr_info("DCCP(%p): option %d(len=%d) not implemented, ignoring\n", + pr_info("DCCP(%p): option %d(len=%d) not " + "implemented, ignoring\n", sk, opt, len); break; } @@ -231,7 +248,8 @@ void dccp_insert_option(struct sock *sk, struct sk_buff *skb, unsigned char *to; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) { - LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert %d option!\n", option); + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert " + "%d option!\n", option); return; } @@ -287,8 +305,8 @@ void dccp_insert_option_elapsed_time(struct sock *sk, { #ifdef DCCP_DEBUG struct dccp_sock *dp = dccp_sk(sk); - const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : - "server TX opt: "; + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT TX opt: " : "server TX opt: "; #endif const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); const int len = 2 + elapsed_time_len; @@ -299,7 +317,8 @@ void dccp_insert_option_elapsed_time(struct sock *sk, return; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert elapsed time!\n"); + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to " + "insert elapsed time!\n"); return; } @@ -323,8 +342,8 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); #ifdef DCCP_DEBUG - const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : - "server TX opt: "; + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT TX opt: " : "server TX opt: "; #endif struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; int len = ap->dccpap_buf_vector_len + 2; @@ -335,7 +354,8 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) dccp_insert_option_elapsed_time(sk, skb, elapsed_time); if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert ACK Vector!\n"); + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to " + "insert ACK Vector!\n"); return; } @@ -360,7 +380,8 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) /* Check if buf_head wraps */ if (ap->dccpap_buf_head + len > ap->dccpap_buf_len) { - const unsigned int tailsize = ap->dccpap_buf_len - ap->dccpap_buf_head; + const unsigned int tailsize = (ap->dccpap_buf_len - + ap->dccpap_buf_head); memcpy(to, from, tailsize); to += tailsize; @@ -375,8 +396,8 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) * For each acknowledgement it sends, the HC-Receiver will add an * acknowledgement record. ack_seqno will equal the HC-Receiver * sequence number it used for the ack packet; ack_ptr will equal - * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will equal - * buf_nonce. + * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will + * equal buf_nonce. * * This implemention uses just one ack record for now. */ @@ -386,33 +407,38 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) ap->dccpap_ack_nonce = ap->dccpap_buf_nonce; ap->dccpap_ack_vector_len = ap->dccpap_buf_vector_len; - dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, ack_ackno=%llu\n", + dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, " + "ack_ackno=%llu\n", debug_prefix, ap->dccpap_ack_vector_len, (unsigned long long) ap->dccpap_ack_seqno, (unsigned long long) ap->dccpap_ack_ackno); } -static inline void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb) +static inline void dccp_insert_option_timestamp(struct sock *sk, + struct sk_buff *skb) { const u32 now = htonl(tcp_time_stamp); dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now)); } -static void dccp_insert_option_timestamp_echo(struct sock *sk, struct sk_buff *skb) +static void dccp_insert_option_timestamp_echo(struct sock *sk, + struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); #ifdef DCCP_DEBUG - const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : - "server TX opt: "; + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT TX opt: " : "server TX opt: "; #endif u32 tstamp_echo; - const u32 elapsed_time = jiffies_to_usecs(jiffies - dp->dccps_timestamp_time) / 10; + const u32 elapsed_time = jiffies_to_usecs(jiffies - + dp->dccps_timestamp_time) / 10; const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); const int len = 6 + elapsed_time_len; unsigned char *to; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert timestamp echo!\n"); + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert " + "timestamp echo!\n"); return; } @@ -447,7 +473,8 @@ void dccp_insert_options(struct sock *sk, struct sk_buff *skb) if (!dccp_packet_without_ack(skb)) { if (dp->dccps_options.dccpo_send_ack_vector && - dp->dccps_hc_rx_ackpkts->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1) + (dp->dccps_hc_rx_ackpkts->dccpap_buf_ackno != + DCCP_MAX_SEQNO + 1)) dccp_insert_option_ack_vector(sk, skb); dccp_insert_option_timestamp(sk, skb); @@ -480,12 +507,16 @@ struct dccp_ackpkts *dccp_ackpkts_alloc(unsigned int len, int priority) #ifdef DCCP_DEBUG memset(ap->dccpap_buf, 0xFF, len); #endif - ap->dccpap_buf_len = len; - ap->dccpap_buf_head = ap->dccpap_buf_tail = ap->dccpap_buf_len - 1; - ap->dccpap_buf_ackno = ap->dccpap_ack_ackno = ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; - ap->dccpap_buf_nonce = ap->dccpap_buf_nonce = 0; - ap->dccpap_ack_ptr = 0; - ap->dccpap_time = 0; + ap->dccpap_buf_len = len; + ap->dccpap_buf_head = + ap->dccpap_buf_tail = + ap->dccpap_buf_len - 1; + ap->dccpap_buf_ackno = + ap->dccpap_ack_ackno = + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + ap->dccpap_buf_nonce = ap->dccpap_buf_nonce = 0; + ap->dccpap_ack_ptr = 0; + ap->dccpap_time = 0; ap->dccpap_buf_vector_len = ap->dccpap_ack_vector_len = 0; } @@ -567,15 +598,16 @@ int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state) * * From Appendix A: * - * Of course, the circular buffer may overflow, either when the HC- - * Sender is sending data at a very high rate, when the HC-Receiver's - * acknowledgements are not reaching the HC-Sender, or when the HC- - * Sender is forgetting to acknowledge those acks (so the HC-Receiver - * is unable to clean up old state). In this case, the HC-Receiver - * should either compress the buffer (by increasing run lengths when - * possible), transfer its state to a larger buffer, or, as a last - * resort, drop all received packets, without processing them - * whatsoever, until its buffer shrinks again. + * Of course, the circular buffer may overflow, either when the + * HC-Sender is sending data at a very high rate, when the + * HC-Receiver's acknowledgements are not reaching the HC-Sender, + * or when the HC-Sender is forgetting to acknowledge those acks + * (so the HC-Receiver is unable to clean up old state). In this + * case, the HC-Receiver should either compress the buffer (by + * increasing run lengths when possible), transfer its state to + * a larger buffer, or, as a last resort, drop all received + * packets, without processing them whatsoever, until its buffer + * shrinks again. */ /* See if this is the first ackno being inserted */ @@ -583,15 +615,17 @@ int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state) ap->dccpap_buf[ap->dccpap_buf_head] = state; ap->dccpap_buf_vector_len = 1; } else if (after48(ackno, ap->dccpap_buf_ackno)) { - const u64 delta = dccp_delta_seqno(ap->dccpap_buf_ackno, ackno); + const u64 delta = dccp_delta_seqno(ap->dccpap_buf_ackno, + ackno); /* - * Look if the state of this packet is the same as the previous ackno - * and if so if we can bump the head len. + * Look if the state of this packet is the same as the + * previous ackno and if so if we can bump the head len. */ if (delta == 1 && dccp_ackpkts_state(ap, ap->dccpap_buf_head) == state && - dccp_ackpkts_len(ap, ap->dccpap_buf_head) < DCCP_ACKPKTS_LEN_MASK) + (dccp_ackpkts_len(ap, ap->dccpap_buf_head) < + DCCP_ACKPKTS_LEN_MASK)) ap->dccpap_buf[ap->dccpap_buf_head]++; else if (dccp_ackpkts_set_buf_head_state(ap, delta, state)) return -ENOBUFS; @@ -599,9 +633,10 @@ int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state) /* * A.1.2. Old Packets * - * When a packet with Sequence Number S arrives, and S <= buf_ackno, - * the HC-Receiver will scan the table for the byte corresponding to S. - * (Indexing structures could reduce the complexity of this scan.) + * When a packet with Sequence Number S arrives, and + * S <= buf_ackno, the HC-Receiver will scan the table + * for the byte corresponding to S. (Indexing structures + * could reduce the complexity of this scan.) */ u64 delta = dccp_delta_seqno(ackno, ap->dccpap_buf_ackno); unsigned int index = ap->dccpap_buf_head; @@ -610,11 +645,12 @@ int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state) const u8 len = dccp_ackpkts_len(ap, index); const u8 state = dccp_ackpkts_state(ap, index); /* - * valid packets not yet in dccpap_buf have a reserved entry, with - * a len equal to 0 + * valid packets not yet in dccpap_buf have a reserved + * entry, with a len equal to 0. */ if (state == DCCP_ACKPKTS_STATE_NOT_RECEIVED && - len == 0 && delta == 0) { /* Found our reserved seat! */ + len == 0 && delta == 0) { /* Found our + reserved seat! */ dccp_pr_debug("Found %llu reserved seat!\n", (unsigned long long) ackno); ap->dccpap_buf[index] = state; @@ -639,13 +675,14 @@ out: out_duplicate: /* Duplicate packet */ - dccp_pr_debug("Received a dup or already considered lost packet: %llu\n", - (unsigned long long) ackno); + dccp_pr_debug("Received a dup or already considered lost " + "packet: %llu\n", (unsigned long long) ackno); return -EILSEQ; } #ifdef DCCP_DEBUG -void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len) +void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, + int len) { if (!dccp_debug) return; @@ -678,8 +715,9 @@ static void dccp_ackpkts_trow_away_ack_record(struct dccp_ackpkts *ap) * As we're keeping track of the ack vector size * (dccpap_buf_vector_len) and the sent ack vector size * (dccpap_ack_vector_len) we don't need dccpap_buf_tail at all, but - * keep this code here as in the future we'll implement a vector of ack - * records, as suggested in draft-ietf-dccp-spec-11.txt Appendix A. -acme + * keep this code here as in the future we'll implement a vector of + * ack records, as suggested in draft-ietf-dccp-spec-11.txt + * Appendix A. -acme */ #if 0 ap->dccpap_buf_tail = ap->dccpap_ack_ptr + 1; @@ -699,10 +737,11 @@ void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk, if (ackno == ap->dccpap_ack_seqno) { #ifdef DCCP_DEBUG struct dccp_sock *dp = dccp_sk(sk); - const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx ack: " : - "server rx ack: "; + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT rx ack: " : "server rx ack: "; #endif - dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, ack_ackno=%llu, ACKED!\n", + dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, " + "ack_ackno=%llu, ACKED!\n", debug_prefix, 1, (unsigned long long) ap->dccpap_ack_seqno, (unsigned long long) ap->dccpap_ack_ackno); @@ -722,20 +761,21 @@ static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1) return; /* - * We're in the receiver half connection, so if the received an ACK vector - * ackno (e.g. 50) before dccpap_ack_seqno (e.g. 52), we're not interested. + * We're in the receiver half connection, so if the received an ACK + * vector ackno (e.g. 50) before dccpap_ack_seqno (e.g. 52), we're + * not interested. * * Extra explanation with example: * * if we received an ACK vector with ackno 50, it can only be acking * 50, 49, 48, etc, not 52 (the seqno for the ACK vector we sent). */ - // dccp_pr_debug("is %llu < %llu? ", ackno, ap->dccpap_ack_seqno); + /* dccp_pr_debug("is %llu < %llu? ", ackno, ap->dccpap_ack_seqno); */ if (before48(ackno, ap->dccpap_ack_seqno)) { - // dccp_pr_debug_cat("yes\n"); + /* dccp_pr_debug_cat("yes\n"); */ return; } - // dccp_pr_debug_cat("no\n"); + /* dccp_pr_debug_cat("no\n"); */ i = len; while (i--) { @@ -744,18 +784,25 @@ static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, dccp_set_seqno(&ackno_end_rl, ackno - rl); - // dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl, ap->dccpap_ack_seqno, ackno); + /* + * dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl, + * ap->dccpap_ack_seqno, ackno); + */ if (between48(ap->dccpap_ack_seqno, ackno_end_rl, ackno)) { - const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6; - // dccp_pr_debug_cat("yes\n"); + const u8 state = (*vector & + DCCP_ACKPKTS_STATE_MASK) >> 6; + /* dccp_pr_debug_cat("yes\n"); */ if (state != DCCP_ACKPKTS_STATE_NOT_RECEIVED) { #ifdef DCCP_DEBUG struct dccp_sock *dp = dccp_sk(sk); - const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx ack: " : - "server rx ack: "; + const char *debug_prefix = + dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT rx ack: " : "server rx ack: "; #endif - dccp_pr_debug("%sACK vector 0, len=%d, ack_seqno=%llu, ack_ackno=%llu, ACKED!\n", + dccp_pr_debug("%sACK vector 0, len=%d, " + "ack_seqno=%llu, ack_ackno=%llu, " + "ACKED!\n", debug_prefix, len, (unsigned long long) ap->dccpap_ack_seqno, @@ -764,13 +811,13 @@ static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, dccp_ackpkts_trow_away_ack_record(ap); } /* - * If dccpap_ack_seqno was not received, no problem we'll - * send another ACK vector. + * If dccpap_ack_seqno was not received, no problem + * we'll send another ACK vector. */ ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; break; } - // dccp_pr_debug_cat("no\n"); + /* dccp_pr_debug_cat("no\n"); */ dccp_set_seqno(&ackno, ackno_end_rl - 1); ++vector; diff --git a/net/dccp/output.c b/net/dccp/output.c index 50292c0..dcc061b 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -40,13 +40,13 @@ int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) /* XXX For now we're using only 48 bits sequence numbers */ const int dccp_header_size = sizeof(*dh) + sizeof(struct dccp_hdr_ext) + - dccp_packet_hdr_len(dcb->dccpd_type); + dccp_packet_hdr_len(dcb->dccpd_type); int err, set_ack = 1; u64 ackno = dp->dccps_gsr; /* - * FIXME: study DCCP_PKT_SYNC[ACK] to see what is the right thing - * to do here... + * FIXME: study DCCP_PKT_SYNC[ACK] to see what is the right + * thing to do here... */ dccp_inc_seqno(&dp->dccps_gss); @@ -65,7 +65,9 @@ int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) skb->h.raw = skb_push(skb, dccp_header_size); dh = dccp_hdr(skb); - /* Data packets are not cloned as they are never retransmitted */ + /* + * Data packets are not cloned as they are never retransmitted + */ if (skb_cloned(skb)) skb_set_owner_w(skb, sk); @@ -86,10 +88,12 @@ int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) switch (dcb->dccpd_type) { case DCCP_PKT_REQUEST: - dccp_hdr_request(skb)->dccph_req_service = dcb->dccpd_service; + dccp_hdr_request(skb)->dccph_req_service = + dcb->dccpd_service; break; case DCCP_PKT_RESET: - dccp_hdr_reset(skb)->dccph_reset_code = dcb->dccpd_reset_code; + dccp_hdr_reset(skb)->dccph_reset_code = + dcb->dccpd_reset_code; break; } @@ -123,10 +127,13 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) int mss_now; /* - * FIXME: we really should be using the af_specific thing to support IPv6. - * mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); + * FIXME: we really should be using the af_specific thing to support + * IPv6. + * mss_now = pmtu - tp->af_specific->net_header_len - + * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); */ - mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); + mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) - + sizeof(struct dccp_hdr_ext); /* Now subtract optional transport overhead */ mss_now -= dp->dccps_ext_header_len; @@ -223,7 +230,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, dh->dccph_sport = inet_sk(sk)->sport; dh->dccph_dport = inet_rsk(req)->rmt_port; - dh->dccph_doff = (dccp_header_size + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; + dh->dccph_doff = (dccp_header_size + + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; dh->dccph_type = DCCP_PKT_RESPONSE; dh->dccph_x = 1; dccp_hdr_set_seq(dh, dccp_rsk(req)->dreq_iss); @@ -271,7 +279,8 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, dh->dccph_sport = inet_sk(sk)->sport; dh->dccph_dport = inet_sk(sk)->dport; - dh->dccph_doff = (dccp_header_size + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; + dh->dccph_doff = (dccp_header_size + + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; dh->dccph_type = DCCP_PKT_RESET; dh->dccph_x = 1; dccp_hdr_set_seq(dh, dp->dccps_gss); @@ -348,7 +357,9 @@ void dccp_send_ack(struct sock *sk) if (skb == NULL) { inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, + DCCP_RTO_MAX); return; } @@ -416,8 +427,10 @@ void dccp_send_sync(struct sock *sk, u64 seq) dccp_transmit_skb(sk, skb); } -/* Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This cannot be - * allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under any circumstances. +/* + * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This + * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under + * any circumstances. */ void dccp_send_close(struct sock *sk) { @@ -435,7 +448,8 @@ void dccp_send_close(struct sock *sk) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, sk->sk_prot->max_header); skb->csum = 0; - DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ? DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; + DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ? + DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; skb_set_owner_w(skb, sk); dccp_transmit_skb(sk, skb); diff --git a/net/dccp/packet_history.c b/net/dccp/packet_history.c index 6b41489..2d9ef5a 100644 --- a/net/dccp/packet_history.c +++ b/net/dccp/packet_history.c @@ -55,7 +55,7 @@ struct dccp_rx_hist *dccp_rx_hist_new(const char *name) sprintf(slab_name, dccp_rx_hist_mask, name); hist->dccprxh_slab = kmem_cache_create(slab_name, - sizeof(struct dccp_rx_hist_entry), + sizeof(struct dccp_rx_hist_entry), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if (hist->dccprxh_slab == NULL) @@ -128,7 +128,7 @@ struct dccp_tx_hist *dccp_tx_hist_new(const char *name) sprintf(slab_name, dccp_tx_hist_mask, name); hist->dccptxh_slab = kmem_cache_create(slab_name, - sizeof(struct dccp_tx_hist_entry), + sizeof(struct dccp_tx_hist_entry), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if (hist->dccptxh_slab == NULL) @@ -156,8 +156,8 @@ void dccp_tx_hist_delete(struct dccp_tx_hist *hist) EXPORT_SYMBOL_GPL(dccp_tx_hist_delete); -struct dccp_tx_hist_entry *dccp_tx_hist_find_entry(const struct list_head *list, - const u64 seq) +struct dccp_tx_hist_entry * + dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq) { struct dccp_tx_hist_entry *packet = NULL, *entry; @@ -172,7 +172,8 @@ struct dccp_tx_hist_entry *dccp_tx_hist_find_entry(const struct list_head *list, EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry); -void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, struct list_head *list, +void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, + struct list_head *list, struct dccp_tx_hist_entry *packet) { struct dccp_tx_hist_entry *next; diff --git a/net/dccp/packet_history.h b/net/dccp/packet_history.h index 0056525..489fff4 100644 --- a/net/dccp/packet_history.h +++ b/net/dccp/packet_history.h @@ -115,7 +115,8 @@ extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list); -static inline struct dccp_tx_hist_entry *dccp_tx_hist_head(struct list_head *list) +static inline struct dccp_tx_hist_entry * + dccp_tx_hist_head(struct list_head *list) { struct dccp_tx_hist_entry *head = NULL; @@ -163,7 +164,8 @@ static inline void dccp_rx_hist_add_entry(struct list_head *list, list_add(&entry->dccphrx_node, list); } -static inline struct dccp_rx_hist_entry *dccp_rx_hist_head(struct list_head *list) +static inline struct dccp_rx_hist_entry * + dccp_rx_hist_head(struct list_head *list) { struct dccp_rx_hist_entry *head = NULL; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 877c1e0..46dd489 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -255,12 +255,16 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* FIXME */ #if 0 - /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ + /* + * Are we at urgent data? Stop if we have read anything or + * have SIGURG pending. + */ if (tp->urg_data && tp->urg_seq == *seq) { if (copied) break; if (signal_pending(current)) { - copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; + copied = timeo ? sock_intr_errno(timeo) : + -EAGAIN; break; } } @@ -285,7 +289,8 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, dccp_pr_debug("found fin ok!\n"); goto found_fin_ok; } - dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); + dccp_pr_debug("packet_type=%s\n", + dccp_packet_name(dh->dccph_type)); BUG_TRAP(flags & MSG_PEEK); skb = skb->next; } while (skb != (struct sk_buff *)&sk->sk_receive_queue); @@ -439,16 +444,16 @@ out: } static const unsigned char dccp_new_state[] = { - /* current state: new state: action: */ - [0] = DCCP_CLOSED, - [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, - [DCCP_REQUESTING] = DCCP_CLOSED, - [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, - [DCCP_LISTEN] = DCCP_CLOSED, - [DCCP_RESPOND] = DCCP_CLOSED, - [DCCP_CLOSING] = DCCP_CLOSED, - [DCCP_TIME_WAIT] = DCCP_CLOSED, - [DCCP_CLOSED] = DCCP_CLOSED, + /* current state: new state: action: */ + [0] = DCCP_CLOSED, + [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, + [DCCP_REQUESTING] = DCCP_CLOSED, + [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, + [DCCP_LISTEN] = DCCP_CLOSED, + [DCCP_RESPOND] = DCCP_CLOSED, + [DCCP_CLOSING] = DCCP_CLOSED, + [DCCP_TIME_WAIT] = DCCP_CLOSED, + [DCCP_CLOSED] = DCCP_CLOSED, }; static int dccp_close_state(struct sock *sk) @@ -541,7 +546,8 @@ struct proto_ops inet_dccp_ops = { .getname = inet_getname, .poll = sock_no_poll, .ioctl = inet_ioctl, - .listen = inet_dccp_listen, /* FIXME: work on inet_listen to rename it to sock_common_listen */ + /* FIXME: work on inet_listen to rename it to sock_common_listen */ + .listen = inet_dccp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, @@ -638,10 +644,10 @@ static int __init dccp_init(void) if (rc) goto out; - dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", - sizeof(struct inet_bind_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); + dccp_hashinfo.bind_bucket_cachep = + kmem_cache_create("dccp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_proto_unregister; @@ -657,14 +663,16 @@ static int __init dccp_init(void) goal = num_physpages >> (23 - PAGE_SHIFT); if (thash_entries) - goal = (thash_entries * sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; + goal = (thash_entries * + sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) ; do { dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE / sizeof(struct inet_ehash_bucket); dccp_hashinfo.ehash_size >>= 1; - while (dccp_hashinfo.ehash_size & (dccp_hashinfo.ehash_size - 1)) + while (dccp_hashinfo.ehash_size & + (dccp_hashinfo.ehash_size - 1)) dccp_hashinfo.ehash_size--; dccp_hashinfo.ehash = (struct inet_ehash_bucket *) __get_free_pages(GFP_ATOMIC, ehash_order); @@ -686,7 +694,8 @@ static int __init dccp_init(void) do { dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / sizeof(struct inet_bind_hashbucket); - if ((dccp_hashinfo.bhash_size > (64 * 1024)) && bhash_order > 0) + if ((dccp_hashinfo.bhash_size > (64 * 1024)) && + bhash_order > 0) continue; dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) __get_free_pages(GFP_ATOMIC, bhash_order); diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 9f1f1ab..47b1616 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -45,11 +45,13 @@ static int dccp_write_timeout(struct sock *sk) if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { if (icsk->icsk_retransmits != 0) dst_negative_advice(&sk->sk_dst_cache); - retry_until = icsk->icsk_syn_retries ? : /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */; + retry_until = icsk->icsk_syn_retries ? : + /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */; } else { - if (icsk->icsk_retransmits >= /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black - hole detection. :-( + if (icsk->icsk_retransmits >= + /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu + black hole detection. :-( It is place to make it. It is not made. I do not want to make it. It is disguisting. It does not work in any @@ -96,14 +98,17 @@ static void dccp_delack_timer(unsigned long data) /* Try again later. */ icsk->icsk_ack.blocked = 1; NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); + sk_reset_timer(sk, &icsk->icsk_delack_timer, + jiffies + TCP_DELACK_MIN); goto out; } - if (sk->sk_state == DCCP_CLOSED || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + if (sk->sk_state == DCCP_CLOSED || + !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; if (time_after(icsk->icsk_ack.timeout, jiffies)) { - sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + sk_reset_timer(sk, &icsk->icsk_delack_timer, + icsk->icsk_ack.timeout); goto out; } @@ -112,7 +117,8 @@ static void dccp_delack_timer(unsigned long data) if (inet_csk_ack_scheduled(sk)) { if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ - icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); + icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, + icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. @@ -167,7 +173,7 @@ static void dccp_retransmit_timer(struct sock *sk) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), - TCP_RTO_MAX); + DCCP_RTO_MAX); goto out; } @@ -175,7 +181,8 @@ static void dccp_retransmit_timer(struct sock *sk) icsk->icsk_retransmits++; icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, + DCCP_RTO_MAX); if (icsk->icsk_retransmits > 3 /* FIXME: sysctl_dccp_retries1 */) __sk_dst_reset(sk); out:; @@ -190,7 +197,8 @@ static void dccp_write_timer(unsigned long data) bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + jiffies + (HZ / 20)); goto out; } @@ -198,7 +206,8 @@ static void dccp_write_timer(unsigned long data) goto out; if (time_after(icsk->icsk_timeout, jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + icsk->icsk_timeout); goto out; } @@ -220,7 +229,8 @@ out: */ static void dccp_response_timer(struct sock *sk) { - inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); + inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT, + DCCP_RTO_MAX); } static void dccp_keepalive_timer(unsigned long data) -- cgit v0.10.2 From 531669a0a9041d60d13920973ef8aa4f743c14a0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 13 Aug 2005 20:35:17 -0300 Subject: [DCCP]: Rewrite dccp_sendmsg to be more like UDP Based on discussions with Nishida-san. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 46dd489..ed0bf58 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -214,197 +214,101 @@ out_discard: goto out_release; } -EXPORT_SYMBOL(dccp_sendmsg); - int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { const struct dccp_hdr *dh; - int copied = 0; - unsigned long used; - int err; - int target; /* Read at least this many bytes */ long timeo; lock_sock(sk); - err = -ENOTCONN; - if (sk->sk_state == DCCP_LISTEN) + if (sk->sk_state == DCCP_LISTEN) { + len = -ENOTCONN; goto out; - - timeo = sock_rcvtimeo(sk, nonblock); - - /* Urgent data needs to be handled specially. */ - if (flags & MSG_OOB) - goto recv_urg; - - /* FIXME */ -#if 0 - seq = &tp->copied_seq; - if (flags & MSG_PEEK) { - peek_seq = tp->copied_seq; - seq = &peek_seq; } -#endif - target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + timeo = sock_rcvtimeo(sk, nonblock); do { - struct sk_buff *skb; - u32 offset; + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - /* FIXME */ -#if 0 - /* - * Are we at urgent data? Stop if we have read anything or - * have SIGURG pending. - */ - if (tp->urg_data && tp->urg_seq == *seq) { - if (copied) - break; - if (signal_pending(current)) { - copied = timeo ? sock_intr_errno(timeo) : - -EAGAIN; - break; - } - } -#endif + if (skb == NULL) + goto verify_sock_status; - /* Next get a buffer. */ - - skb = skb_peek(&sk->sk_receive_queue); - do { - if (!skb) - break; + dh = dccp_hdr(skb); - offset = 0; - dh = dccp_hdr(skb); + if (dh->dccph_type == DCCP_PKT_DATA || + dh->dccph_type == DCCP_PKT_DATAACK) + goto found_ok_skb; - if (dh->dccph_type == DCCP_PKT_DATA || - dh->dccph_type == DCCP_PKT_DATAACK) - goto found_ok_skb; - - if (dh->dccph_type == DCCP_PKT_RESET || - dh->dccph_type == DCCP_PKT_CLOSE) { - dccp_pr_debug("found fin ok!\n"); - goto found_fin_ok; - } - dccp_pr_debug("packet_type=%s\n", - dccp_packet_name(dh->dccph_type)); - BUG_TRAP(flags & MSG_PEEK); - skb = skb->next; - } while (skb != (struct sk_buff *)&sk->sk_receive_queue); - - /* Well, if we have backlog, try to process it now yet. */ - if (copied >= target && !sk->sk_backlog.tail) + if (dh->dccph_type == DCCP_PKT_RESET || + dh->dccph_type == DCCP_PKT_CLOSE) { + dccp_pr_debug("found fin ok!\n"); + len = 0; + goto found_fin_ok; + } + dccp_pr_debug("packet_type=%s\n", + dccp_packet_name(dh->dccph_type)); + sk_eat_skb(sk, skb); +verify_sock_status: + if (sock_flag(sk, SOCK_DONE)) { + len = 0; break; + } - if (copied) { - if (sk->sk_err || - sk->sk_state == DCCP_CLOSED || - (sk->sk_shutdown & RCV_SHUTDOWN) || - !timeo || - signal_pending(current) || - (flags & MSG_PEEK)) - break; - } else { - if (sock_flag(sk, SOCK_DONE)) - break; - - if (sk->sk_err) { - copied = sock_error(sk); - break; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN) - break; - - if (sk->sk_state == DCCP_CLOSED) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - copied = -ENOTCONN; - break; - } - break; - } + if (sk->sk_err) { + len = sock_error(sk); + break; + } - if (!timeo) { - copied = -EAGAIN; - break; - } + if (sk->sk_shutdown & RCV_SHUTDOWN) { + len = 0; + break; + } - if (signal_pending(current)) { - copied = sock_intr_errno(timeo); + if (sk->sk_state == DCCP_CLOSED) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + len = -ENOTCONN; break; } + len = 0; + break; } - /* FIXME: cleanup_rbuf(sk, copied); */ + if (!timeo) { + len = -EAGAIN; + break; + } - if (copied >= target) { - /* Do not sleep, just process backlog. */ - release_sock(sk); - lock_sock(sk); - } else - sk_wait_data(sk, &timeo); + if (signal_pending(current)) { + len = sock_intr_errno(timeo); + break; + } + sk_wait_data(sk, &timeo); continue; - found_ok_skb: - /* Ok so how much can we use? */ - used = skb->len - offset; - if (len < used) - used = len; - - if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } + if (len > skb->len) + len = skb->len; + else if (len < skb->len) + msg->msg_flags |= MSG_TRUNC; + + if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) { + /* Exception. Bailout! */ + len = -EFAULT; + break; } - - copied += used; - len -= used; - - /* FIXME: tcp_rcv_space_adjust(sk); */ - -//skip_copy: - if (used + offset < skb->len) - continue; - - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); - continue; found_fin_ok: if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; - - } while (len > 0); - - /* According to UNIX98, msg_name/msg_namelen are ignored - * on connected socket. I was just happy when found this 8) --ANK - */ - - /* Clean up data we have read: This will do ACK frames. */ - /* FIXME: cleanup_rbuf(sk, copied); */ - - release_sock(sk); - return copied; - + } while (1); out: release_sock(sk); - return err; - -recv_urg: - /* FIXME: err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); */ - goto out; + return len; } static int inet_dccp_listen(struct socket *sock, int backlog) -- cgit v0.10.2 From 725ba8eee3881e619c8e5a0116f1bdb6480ac2d9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 13 Aug 2005 20:35:39 -0300 Subject: [DCCP]: Introduce the DCCP Kernel hacking menu Only available if CONFIG_DEBUG_KERNEL is enabled in the "Kernel Hacking" Menu. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 6760830..3023f70 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -26,4 +26,25 @@ config INET_DCCP_DIAG source "net/dccp/ccids/Kconfig" +menu "DCCP Kernel Hacking" + depends on IP_DCCP=m && DEBUG_KERNEL=y + +config IP_DCCP_DEBUG + bool "DCCP debug messages" + ---help--- + Only use this if you're hacking DCCP. + + Just say N. + +config IP_DCCP_UNLOAD_HACK + depends on IP_DCCP_CCID3=m + bool "DCCP control sock unload hack" + ---help--- + Enable this to be able to unload the dccp module when the it + has only one refcount held, the control sock one. Just execute + "rmmod dccp_ccid3 dccp" + + Just say N. +endmenu + endmenu diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index edf9740..09274f3 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -2078,6 +2078,15 @@ module_init(ccid3_module_init); static __exit void ccid3_module_exit(void) { +#ifdef CONFIG_IP_DCCP_UNLOAD_HACK + /* + * Hack to use while developing, so that we get rid of the control + * sock, that is what keeps a refcount on dccp.ko -acme + */ + extern void dccp_ctl_sock_exit(void); + + dccp_ctl_sock_exit(); +#endif ccid_unregister(&ccid3); if (ccid3_tx_hist != NULL) { diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 62e735f..270f194 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -11,14 +11,13 @@ * published by the Free Software Foundation. */ +#include #include #include #include #include -#define DCCP_DEBUG - -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG extern int dccp_debug; #define dccp_pr_debug(format, a...) \ @@ -426,7 +425,7 @@ extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state); extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk, u64 ackno); -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG extern void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len); extern void dccp_ackpkts_print(const struct dccp_ackpkts *ap); diff --git a/net/dccp/options.c b/net/dccp/options.c index 68d6614..fc363aa 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -58,7 +58,7 @@ static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) int dccp_parse_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx opt: " : "server rx opt: "; #endif @@ -303,7 +303,7 @@ void dccp_insert_option_elapsed_time(struct sock *sk, struct sk_buff *skb, u32 elapsed_time) { -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG struct dccp_sock *dp = dccp_sk(sk); const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : "server TX opt: "; @@ -341,7 +341,7 @@ EXPORT_SYMBOL(dccp_insert_option_elapsed_time); static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : "server TX opt: "; #endif @@ -425,7 +425,7 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT TX opt: " : "server TX opt: "; #endif @@ -504,7 +504,7 @@ struct dccp_ackpkts *dccp_ackpkts_alloc(unsigned int len, int priority) struct dccp_ackpkts *ap = kmalloc(sizeof(*ap) + len, priority); if (ap != NULL) { -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG memset(ap->dccpap_buf, 0xFF, len); #endif ap->dccpap_buf_len = len; @@ -526,7 +526,7 @@ struct dccp_ackpkts *dccp_ackpkts_alloc(unsigned int len, int priority) void dccp_ackpkts_free(struct dccp_ackpkts *ap) { if (ap != NULL) { -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG memset(ap, 0xFF, sizeof(*ap) + ap->dccpap_buf_len); #endif kfree(ap); @@ -680,7 +680,7 @@ out_duplicate: return -EILSEQ; } -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len) { @@ -735,7 +735,7 @@ void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk, return; if (ackno == ap->dccpap_ack_seqno) { -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG struct dccp_sock *dp = dccp_sk(sk); const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? "CLIENT rx ack: " : "server rx ack: "; @@ -794,7 +794,7 @@ static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, /* dccp_pr_debug_cat("yes\n"); */ if (state != DCCP_ACKPKTS_STATE_NOT_RECEIVED) { -#ifdef DCCP_DEBUG +#ifdef CONFIG_IP_DCCP_DEBUG struct dccp_sock *dp = dccp_sk(sk); const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ed0bf58..be06692 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -503,12 +503,16 @@ static int __init dccp_ctl_sock_init(void) return rc; } -static void __exit dccp_ctl_sock_exit(void) +#ifdef CONFIG_IP_DCCP_UNLOAD_HACK +void dccp_ctl_sock_exit(void) { if (dccp_ctl_socket != NULL) sock_release(dccp_ctl_socket); } +EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit); +#endif + static int __init init_dccp_v4_mibs(void) { int rc = -ENOMEM; @@ -655,19 +659,21 @@ static const char dccp_del_proto_err_msg[] __exitdata = static void __exit dccp_fini(void) { - dccp_ctl_sock_exit(); - inet_unregister_protosw(&dccp_v4_protosw); if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0) printk(dccp_del_proto_err_msg); - /* Free the control endpoint. */ - sock_release(dccp_ctl_socket); - - proto_unregister(&dccp_v4_prot); - + free_percpu(dccp_statistics[0]); + free_percpu(dccp_statistics[1]); + free_pages((unsigned long)dccp_hashinfo.bhash, + get_order(dccp_hashinfo.bhash_size * + sizeof(struct inet_bind_hashbucket))); + free_pages((unsigned long)dccp_hashinfo.ehash, + get_order(dccp_hashinfo.ehash_size * + sizeof(struct inet_ehash_bucket))); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); + proto_unregister(&dccp_v4_prot); } module_init(dccp_init); -- cgit v0.10.2 From 8649b0d4166e6e80ffa298e75abd8f2afdd491a6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 13 Aug 2005 20:36:01 -0300 Subject: [DCCP]: Fix RESET handling in dccp_rcv_state_process To avoid holding TIMEWAIT state for sockets in the LISTEN state. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/input.c b/net/dccp/input.c index 4b8638f..9dadfc3 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -402,7 +402,48 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const int old_state = sk->sk_state; int queued = 0; - if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING) { + /* + * Step 3: Process LISTEN state + * (Continuing from dccp_v4_do_rcv and dccp_v6_do_rcv) + * + * If S.state == LISTEN, + * If P.type == Request or P contains a valid Init Cookie + * option, + * * Must scan the packet's options to check for an Init + * Cookie. Only the Init Cookie is processed here, + * however; other options are processed in Step 8. This + * scan need only be performed if the endpoint uses Init + * Cookies * + * * Generate a new socket and switch to that socket * + * Set S := new socket for this port pair + * S.state = RESPOND + * Choose S.ISS (initial seqno) or set from Init Cookie + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * Continue with S.state == RESPOND + * * A Response packet will be generated in Step 11 * + * Otherwise, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + * + * NOTE: the check for the packet types is done in + * dccp_rcv_state_process + */ + if (sk->sk_state == DCCP_LISTEN) { + if (dh->dccph_type == DCCP_PKT_REQUEST) { + if (dccp_v4_conn_request(sk, skb) < 0) + return 1; + + /* FIXME: do congestion control initialization */ + goto discard; + } + if (dh->dccph_type == DCCP_PKT_RESET) + goto discard; + + /* Caller (dccp_v4_do_rcv) will send Reset(No Connection)*/ + return 1; + } + + if (sk->sk_state != DCCP_REQUESTING) { if (dccp_check_seqno(sk, skb)) goto discard; @@ -484,23 +525,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case DCCP_CLOSED: return 1; - case DCCP_LISTEN: - if (dh->dccph_type == DCCP_PKT_ACK || - dh->dccph_type == DCCP_PKT_DATAACK) - return 1; - - if (dh->dccph_type == DCCP_PKT_RESET) - goto discard; - - if (dh->dccph_type == DCCP_PKT_REQUEST) { - if (dccp_v4_conn_request(sk, skb) < 0) - return 1; - - /* FIXME: do congestion control initialization */ - goto discard; - } - goto discard; - case DCCP_REQUESTING: /* FIXME: do congestion control initialization */ -- cgit v0.10.2 From a1d3a35518779df0579dd9de0121354b49c68ddc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 13 Aug 2005 22:42:25 -0300 Subject: [DCCP]: Fix sparse warnings Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 09274f3..21948d0 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -82,12 +82,13 @@ enum ccid3_options { static int ccid3_debug; -struct dccp_tx_hist *ccid3_tx_hist; -struct dccp_rx_hist *ccid3_rx_hist; +static struct dccp_tx_hist *ccid3_tx_hist; +static struct dccp_rx_hist *ccid3_rx_hist; static kmem_cache_t *ccid3_loss_interval_hist_slab; -static inline struct ccid3_loss_interval_hist_entry *ccid3_loss_interval_hist_entry_new(int prio) +static inline struct ccid3_loss_interval_hist_entry * + ccid3_loss_interval_hist_entry_new(const unsigned int __nocast prio) { return kmem_cache_alloc(ccid3_loss_interval_hist_slab, prio); } @@ -1593,7 +1594,9 @@ static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) * These are integers as per section 8 of RFC3448. We can then divide by 4 * * when we use it. */ -const int ccid3_hc_rx_w[TFRC_RECV_IVAL_F_LENGTH] = { 4, 4, 4, 4, 3, 2, 1, 1, }; +static const int ccid3_hc_rx_w[TFRC_RECV_IVAL_F_LENGTH] = { + 4, 4, 4, 4, 3, 2, 1, 1, +}; /* * args: fvalue - function value to match @@ -1601,7 +1604,7 @@ const int ccid3_hc_rx_w[TFRC_RECV_IVAL_F_LENGTH] = { 4, 4, 4, 4, 3, 2, 1, 1, }; * * both fvalue and p are multiplied by 1,000,000 to use ints */ -u32 calcx_reverse_lookup(u32 fvalue) { +static u32 calcx_reverse_lookup(u32 fvalue) { int ctr = 0; int small; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 270f194..148e8a6 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -231,19 +231,22 @@ extern void dccp_close(struct sock *sk, long timeout); extern struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, struct request_sock *req); +extern struct sk_buff *dccp_make_reset(struct sock *sk, + struct dst_entry *dst, + enum dccp_reset_codes code); extern int dccp_connect(struct sock *sk); extern int dccp_disconnect(struct sock *sk, int flags); extern int dccp_getsockopt(struct sock *sk, int level, int optname, - char *optval, int *optlen); + char __user *optval, int __user *optlen); +extern int dccp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen); extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -extern int dccp_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen); extern void dccp_shutdown(struct sock *sk, int how); extern int dccp_v4_checksum(const struct sk_buff *skb, @@ -419,7 +422,9 @@ struct dccp_ackpkts { u8 dccpap_buf[0]; }; -extern struct dccp_ackpkts *dccp_ackpkts_alloc(unsigned int len, int priority); +extern struct dccp_ackpkts * + dccp_ackpkts_alloc(unsigned int len, + const unsigned int __nocast priority); extern void dccp_ackpkts_free(struct dccp_ackpkts *ap); extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state); extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 42d9c87..bc3cfc0 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -617,9 +617,6 @@ out: sock_put(sk); } -extern struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, - enum dccp_reset_codes code); - int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) { struct sk_buff *skb; @@ -881,7 +878,7 @@ static struct dst_entry* dccp_v4_route_skb(struct sock *sk, return &rt->u.dst; } -void dccp_v4_ctl_send_reset(struct sk_buff *rxskb) +static void dccp_v4_ctl_send_reset(struct sk_buff *rxskb) { int err; struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; @@ -1268,7 +1265,7 @@ static int dccp_v4_init_sock(struct sock *sk) return 0; } -int dccp_v4_destroy_sock(struct sock *sk) +static int dccp_v4_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); diff --git a/net/dccp/options.c b/net/dccp/options.c index fc363aa..d87d6be 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -499,7 +499,8 @@ void dccp_insert_options(struct sock *sk, struct sk_buff *skb) } } -struct dccp_ackpkts *dccp_ackpkts_alloc(unsigned int len, int priority) +struct dccp_ackpkts *dccp_ackpkts_alloc(const unsigned int len, + const unsigned int __nocast priority) { struct dccp_ackpkts *ap = kmalloc(sizeof(*ap) + len, priority); diff --git a/net/dccp/packet_history.h b/net/dccp/packet_history.h index 489fff4..2e5ba34 100644 --- a/net/dccp/packet_history.h +++ b/net/dccp/packet_history.h @@ -79,8 +79,8 @@ extern struct dccp_rx_hist_entry * dccp_rx_hist_find_data_packet(const struct list_head *list); static inline struct dccp_tx_hist_entry * - dccp_tx_hist_entry_new(struct dccp_tx_hist *hist, - const int prio) + dccp_tx_hist_entry_new(struct dccp_tx_hist *hist, + const unsigned int __nocast prio) { struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab, prio); @@ -127,10 +127,10 @@ static inline struct dccp_tx_hist_entry * } static inline struct dccp_rx_hist_entry * - dccp_rx_hist_entry_new(struct dccp_rx_hist *hist, - const u32 ndp, - const struct sk_buff *skb, - const int prio) + dccp_rx_hist_entry_new(struct dccp_rx_hist *hist, + const u32 ndp, + const struct sk_buff *skb, + const unsigned int __nocast prio) { struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab, prio); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index be06692..0b715ce 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -147,7 +147,7 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) } int dccp_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) + char __user *optval, int optlen) { dccp_pr_debug("entry\n"); @@ -158,7 +158,7 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, } int dccp_getsockopt(struct sock *sk, int level, int optname, - char *optval, int *optlen) + char __user *optval, int __user *optlen) { dccp_pr_debug("entry\n"); @@ -439,7 +439,7 @@ void dccp_shutdown(struct sock *sk, int how) dccp_pr_debug("entry\n"); } -struct proto_ops inet_dccp_ops = { +static struct proto_ops inet_dccp_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -539,9 +539,11 @@ static int thash_entries; module_param(thash_entries, int, 0444); MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); +#ifdef CONFIG_IP_DCCP_DEBUG int dccp_debug; module_param(dccp_debug, int, 0444); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); +#endif static int __init dccp_init(void) { -- cgit v0.10.2 From a10cedd4b905236603c6c4fd77cf338ebbfb1a60 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 21:05:53 -0300 Subject: [DCCP]: Fix compiler warnings may be a false warning if there always is something on ccid3hcrx_hist: net/dccp/ccids/ccid3.c: In function 'ccid3_hc_rx_packet_recv': net/dccp/ccids/ccid3.c:1634: warning: 'tstamp.tv_usec' may be used uninitialized in this function net/dccp/ccids/ccid3.c:1634: warning: 'tstamp.tv_sec' may be used uninitialized in this function const on inline functions doesn't have any effect: net/dccp/dccp.h:64: warning: type qualifiers ignored on function return type net/dccp/dccp.h:70: warning: type qualifiers ignored on function return type net/dccp/dccp.h:76: warning: type qualifiers ignored on function return type Signed-off-by: Patrick McHardy Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 21948d0..2dd3e94 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -1634,7 +1634,7 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; struct dccp_rx_hist_entry *entry, *next, *tail = NULL; u32 rtt, delta, x_recv, fval, p, tmp2; - struct timeval tstamp, tmp_tv; + struct timeval tstamp = { 0 }, tmp_tv; int interval = 0; int win_count = 0; int step = 0; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 148e8a6..fff794c 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -60,20 +60,19 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); extern struct proto dccp_v4_prot; /* is seq1 < seq2 ? */ -static inline const int before48(const u64 seq1, const u64 seq2) +static inline int before48(const u64 seq1, const u64 seq2) { - return (const s64)((seq1 << 16) - (seq2 << 16)) < 0; + return (s64)((seq1 << 16) - (seq2 << 16)) < 0; } /* is seq1 > seq2 ? */ -static inline const int after48(const u64 seq1, const u64 seq2) +static inline int after48(const u64 seq1, const u64 seq2) { - return (const s64)((seq2 << 16) - (seq1 << 16)) < 0; + return (s64)((seq2 << 16) - (seq1 << 16)) < 0; } /* is seq2 <= seq1 <= seq3 ? */ -static inline const int between48(const u64 seq1, const u64 seq2, - const u64 seq3) +static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) { return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); } -- cgit v0.10.2 From 7de76272b54e3677bcd247d1e1809015236a298d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 14 Aug 2005 18:01:08 -0700 Subject: [IPX]: Fix build error in ipx_recvmsg() Missing semicolon introduced by skb->stamp changeset: d3258b7d8ed96f97032639bc745179f1951b0da5 Signed-off-by: David S. Miller diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index c54f8ac..180e383 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1797,7 +1797,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, if (rc) goto out_free; if (skb->tstamp.off_sec) - skb_get_timestamp(skb, &sk->sk_stamp) + skb_get_timestamp(skb, &sk->sk_stamp); msg->msg_namelen = sizeof(*sipx); -- cgit v0.10.2 From ad93e266a17c6f606e96304c866eb73665ae34fa Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Sun, 14 Aug 2005 19:24:58 -0700 Subject: [NETLINK]: w1_int.c: fix default netlink group w1 does not need to multicast its state to several groups at once, and upcoming netlink changes will not allow bitmask for groups anyway. Signed-off-by: Evgeniy Polyakov Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index 8809788..f3f339d 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -86,7 +86,7 @@ static struct w1_master * w1_alloc_dev(u32 id, int slave_count, int slave_ttl, dev->driver = driver; - dev->groups = 23; + dev->groups = 1; dev->seq = 1; dev->nls = netlink_kernel_create(NETLINK_W1, NULL, THIS_MODULE); if (!dev->nls) { -- cgit v0.10.2 From 43e943c32b9213b5d25407b281c94aaa474fd9a6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 19:25:47 -0700 Subject: [NETLINK]: Fix missing dst_groups initializations in netlink_broadcast users netlink_broadcast users must initialize NETLINK_CB(skb).dst_groups to the destination group mask for netlink_recvmsg. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 88f4d74..bc00061 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -93,6 +93,7 @@ static int send_uevent(const char *signal, const char *obj, } } + NETLINK_CB(skb).dst_groups = 1; return netlink_broadcast(uevent_sock, skb, 0, 1, gfp_mask); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 33ceeea..4d553a1 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1152,6 +1152,8 @@ static int xfrm_notify_sa_flush(struct km_event *c) nlh->nlmsg_len = skb->tail - b; + NETLINK_CB(skb).dst_groups = XFRMGRP_SA; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC); nlmsg_failure: @@ -1226,6 +1228,8 @@ static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c) nlh->nlmsg_len = skb->tail - b; + NETLINK_CB(skb).dst_groups = XFRMGRP_SA; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC); nlmsg_failure: @@ -1455,6 +1459,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event * nlh->nlmsg_len = skb->tail - b; + NETLINK_CB(skb).dst_groups = XFRMGRP_POLICY; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC); nlmsg_failure: @@ -1480,6 +1486,8 @@ static int xfrm_notify_policy_flush(struct km_event *c) nlh->nlmsg_len = skb->tail - b; + NETLINK_CB(skb).dst_groups = XFRMGRP_POLICY; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC); nlmsg_failure: diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index 341dbe2..0f7be65 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -80,6 +80,7 @@ static void selnl_notify(int msgtype, void *data) nlh = NLMSG_PUT(skb, 0, 0, msgtype, len); selnl_add_payload(nlh, len, msgtype, data); nlh->nlmsg_len = skb->tail - tmp; + NETLINK_CB(skb).dst_groups = SELNL_GRP_AVC; netlink_broadcast(selnl, skb, 0, SELNL_GRP_AVC, GFP_USER); out: return; -- cgit v0.10.2 From db080529798b497eb5a37b92a25e966be5a7dd5d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 19:26:34 -0700 Subject: [NETLINK]: Remove unused groups member from struct netlink_skb_parms Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netlink.h b/include/linux/netlink.h index d5e09bc..eab51f9 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -106,7 +106,6 @@ struct netlink_skb_parms { struct ucred creds; /* Skb credentials */ __u32 pid; - __u32 groups; __u32 dst_pid; __u32 dst_groups; kernel_cap_t eff_cap; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b5e2f15..75d03e3 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -558,7 +558,6 @@ static void nl_fib_input(struct sock *sk, int len) nl_fib_lookup(frn, tb); pid = nlh->nlmsg_pid; /*pid of sending process */ - NETLINK_CB(skb).groups = 0; /* not in mcast group */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_pid = pid; NETLINK_CB(skb).dst_groups = 0; /* unicast */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5d487cd..7b7b45a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -950,7 +950,6 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; NETLINK_CB(skb).pid = nlk->pid; - NETLINK_CB(skb).groups = nlk->groups; NETLINK_CB(skb).dst_pid = dst_pid; NETLINK_CB(skb).dst_groups = dst_groups; NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); -- cgit v0.10.2 From 77247bbb3094246be9d057e7be442cc708f123a8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 19:27:13 -0700 Subject: [NETLINK]: Fix module refcounting problems Use-after-free: the struct proto_ops containing the module pointer is freed when a socket with pid=0 is released, which besides for kernel sockets is true for all unbound sockets. Module refcount leak: when the kernel socket is closed before all user sockets have been closed the proto_ops struct for this family is replaced by the generic one and the module refcount can't be dropped. The second problem can't be solved cleanly using module refcounting in the generic socket code, so this patch adds explicit refcounting to netlink_create/netlink_release. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7b7b45a..c41a881 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -73,8 +73,12 @@ struct netlink_sock { struct netlink_callback *cb; spinlock_t cb_lock; void (*data_ready)(struct sock *sk, int bytes); + struct module *module; + u32 flags; }; +#define NETLINK_KERNEL_SOCKET 0x1 + static inline struct netlink_sock *nlk_sk(struct sock *sk) { return (struct netlink_sock *)sk; @@ -97,7 +101,7 @@ struct netlink_table { struct nl_pid_hash hash; struct hlist_head mc_list; unsigned int nl_nonroot; - struct proto_ops *p_ops; + struct module *module; }; static struct netlink_table *nl_table; @@ -338,6 +342,7 @@ static int netlink_create(struct socket *sock, int protocol) { struct sock *sk; struct netlink_sock *nlk; + struct module *module; sock->state = SS_UNCONNECTED; @@ -347,30 +352,36 @@ static int netlink_create(struct socket *sock, int protocol) if (protocol<0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; - netlink_table_grab(); + netlink_lock_table(); if (!nl_table[protocol].hash.entries) { #ifdef CONFIG_KMOD /* We do 'best effort'. If we find a matching module, * it is loaded. If not, we don't return an error to * allow pure userspace<->userspace communication. -HW */ - netlink_table_ungrab(); + netlink_unlock_table(); request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); - netlink_table_grab(); + netlink_lock_table(); #endif } - netlink_table_ungrab(); + module = nl_table[protocol].module; + if (!try_module_get(module)) + module = NULL; + netlink_unlock_table(); - sock->ops = nl_table[protocol].p_ops; + sock->ops = &netlink_ops; sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); - if (!sk) + if (!sk) { + module_put(module); return -ENOMEM; + } sock_init_data(sock, sk); nlk = nlk_sk(sk); + nlk->module = module; spin_lock_init(&nlk->cb_lock); init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; @@ -415,22 +426,15 @@ static int netlink_release(struct socket *sock) notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } - /* When this is a kernel socket, we need to remove the owner pointer, - * since we don't know whether the module will be dying at any given - * point - HW - */ - if (!nlk->pid) { - struct proto_ops *p_tmp; + if (nlk->module) + module_put(nlk->module); + if (nlk->flags & NETLINK_KERNEL_SOCKET) { netlink_table_grab(); - p_tmp = nl_table[sk->sk_protocol].p_ops; - if (p_tmp != &netlink_ops) { - nl_table[sk->sk_protocol].p_ops = &netlink_ops; - kfree(p_tmp); - } + nl_table[sk->sk_protocol].module = NULL; netlink_table_ungrab(); } - + sock_put(sk); return 0; } @@ -1060,9 +1064,9 @@ static void netlink_data_ready(struct sock *sk, int len) struct sock * netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct module *module) { - struct proto_ops *p_ops; struct socket *sock; struct sock *sk; + struct netlink_sock *nlk; if (!nl_table) return NULL; @@ -1070,64 +1074,32 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct if (unit<0 || unit>=MAX_LINKS) return NULL; - /* Do a quick check, to make us not go down to netlink_insert() - * if protocol already has kernel socket. - */ - sk = netlink_lookup(unit, 0); - if (unlikely(sk)) { - sock_put(sk); - return NULL; - } - if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - sk = NULL; - if (module) { - /* Every registering protocol implemented in a module needs - * it's own p_ops, since the socket code cannot deal with - * module refcounting otherwise. -HW - */ - p_ops = kmalloc(sizeof(*p_ops), GFP_KERNEL); - if (!p_ops) - goto out_sock_release; - - memcpy(p_ops, &netlink_ops, sizeof(*p_ops)); - p_ops->owner = module; - } else - p_ops = &netlink_ops; - - netlink_table_grab(); - nl_table[unit].p_ops = p_ops; - netlink_table_ungrab(); - - if (netlink_create(sock, unit) < 0) { - sk = NULL; - goto out_kfree_p_ops; - } + if (netlink_create(sock, unit) < 0) + goto out_sock_release; sk = sock->sk; sk->sk_data_ready = netlink_data_ready; if (input) nlk_sk(sk)->data_ready = input; - if (netlink_insert(sk, 0)) { - sk = NULL; - goto out_kfree_p_ops; - } + if (netlink_insert(sk, 0)) + goto out_sock_release; - return sk; + nlk = nlk_sk(sk); + nlk->flags |= NETLINK_KERNEL_SOCKET; -out_kfree_p_ops: netlink_table_grab(); - if (nl_table[unit].p_ops != &netlink_ops) { - kfree(nl_table[unit].p_ops); - nl_table[unit].p_ops = &netlink_ops; - } + nl_table[unit].module = module; netlink_table_ungrab(); + + return sk; + out_sock_release: sock_release(sock); - return sk; + return NULL; } void netlink_set_nonroot(int protocol, unsigned int flags) @@ -1490,8 +1462,6 @@ enomem: for (i = 0; i < MAX_LINKS; i++) { struct nl_pid_hash *hash = &nl_table[i].hash; - nl_table[i].p_ops = &netlink_ops; - hash->table = nl_pid_hash_alloc(1 * sizeof(*hash->table)); if (!hash->table) { while (i-- > 0) -- cgit v0.10.2 From d629b836d151d43332492651dd841d32e57ebe3b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 19:27:50 -0700 Subject: [NETLINK]: Use group numbers instead of bitmasks internally Using the group number allows increasing the number of groups without beeing limited by the size of the bitmask. It introduces one limitation for netlink users: messages can't be broadcasted to multiple groups anymore, however this feature was never used inside the kernel. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netlink.h b/include/linux/netlink.h index eab51f9..c724c9d 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -107,7 +107,7 @@ struct netlink_skb_parms struct ucred creds; /* Skb credentials */ __u32 pid; __u32 dst_pid; - __u32 dst_groups; + __u32 dst_group; kernel_cap_t eff_cap; __u32 loginuid; /* Login (audit) uid */ }; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c41a881..3c56b96 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -67,7 +67,7 @@ struct netlink_sock { u32 pid; unsigned int groups; u32 dst_pid; - unsigned int dst_groups; + u32 dst_group; unsigned long state; wait_queue_head_t wait; struct netlink_callback *cb; @@ -116,6 +116,11 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); static struct notifier_block *netlink_chain; +static u32 netlink_group_mask(u32 group) +{ + return group ? 1 << (group - 1) : 0; +} + static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) { return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; @@ -533,7 +538,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, if (addr->sa_family == AF_UNSPEC) { sk->sk_state = NETLINK_UNCONNECTED; nlk->dst_pid = 0; - nlk->dst_groups = 0; + nlk->dst_group = 0; return 0; } if (addr->sa_family != AF_NETLINK) @@ -549,7 +554,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, if (err == 0) { sk->sk_state = NETLINK_CONNECTED; nlk->dst_pid = nladdr->nl_pid; - nlk->dst_groups = nladdr->nl_groups; + nlk->dst_group = ffs(nladdr->nl_groups); } return err; @@ -567,10 +572,10 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr if (peer) { nladdr->nl_pid = nlk->dst_pid; - nladdr->nl_groups = nlk->dst_groups; + nladdr->nl_groups = netlink_group_mask(nlk->dst_group); } else { nladdr->nl_pid = nlk->pid; - nladdr->nl_groups = nlk->groups; + nladdr->nl_groups = nlk->groups; } return 0; } @@ -771,7 +776,7 @@ static inline int do_one_broadcast(struct sock *sk, if (p->exclude_sk == sk) goto out; - if (nlk->pid == p->pid || !(nlk->groups & p->group)) + if (nlk->pid == p->pid || !(nlk->groups & netlink_group_mask(p->group))) goto out; if (p->failure) { @@ -867,7 +872,7 @@ static inline int do_one_set_err(struct sock *sk, if (sk == p->exclude_sk) goto out; - if (nlk->pid == p->pid || !(nlk->groups & p->group)) + if (nlk->pid == p->pid || !(nlk->groups & netlink_group_mask(p->group))) goto out; sk->sk_err = p->code; @@ -913,7 +918,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *addr=msg->msg_name; u32 dst_pid; - u32 dst_groups; + u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; @@ -931,12 +936,12 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (addr->nl_family != AF_NETLINK) return -EINVAL; dst_pid = addr->nl_pid; - dst_groups = addr->nl_groups; - if (dst_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + dst_group = ffs(addr->nl_groups); + if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) return -EPERM; } else { dst_pid = nlk->dst_pid; - dst_groups = nlk->dst_groups; + dst_group = nlk->dst_group; } if (!nlk->pid) { @@ -955,7 +960,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, NETLINK_CB(skb).pid = nlk->pid; NETLINK_CB(skb).dst_pid = dst_pid; - NETLINK_CB(skb).dst_groups = dst_groups; + NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); @@ -977,9 +982,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; } - if (dst_groups) { + if (dst_group) { atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL); + netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); @@ -1025,7 +1030,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).pid; - addr->nl_groups = NETLINK_CB(skb).dst_groups; + addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } -- cgit v0.10.2 From ac6d439d2097b72ea0cbc2322ce1263a38bc1fd0 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 19:29:52 -0700 Subject: [NETLINK]: Convert netlink users to use group numbers instead of bitmasks Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/drivers/w1/w1_netlink.c b/drivers/w1/w1_netlink.c index 2a82fb0..e7b7744 100644 --- a/drivers/w1/w1_netlink.c +++ b/drivers/w1/w1_netlink.c @@ -51,7 +51,7 @@ void w1_netlink_send(struct w1_master *dev, struct w1_netlink_msg *msg) memcpy(data, msg, sizeof(struct w1_netlink_msg)); - NETLINK_CB(skb).dst_groups = dev->groups; + NETLINK_CB(skb).dst_group = dev->groups; netlink_broadcast(dev->nls, skb, 0, dev->groups, GFP_ATOMIC); nlmsg_failure: diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index b0feb23..1d5b10a 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -2,13 +2,34 @@ #define _NFNETLINK_H #include -/* nfnetlink groups: Up to 32 maximum */ +#ifndef __KERNEL__ +/* nfnetlink groups: Up to 32 maximum - backwards compatibility for userspace */ #define NF_NETLINK_CONNTRACK_NEW 0x00000001 #define NF_NETLINK_CONNTRACK_UPDATE 0x00000002 #define NF_NETLINK_CONNTRACK_DESTROY 0x00000004 #define NF_NETLINK_CONNTRACK_EXP_NEW 0x00000008 #define NF_NETLINK_CONNTRACK_EXP_UPDATE 0x00000010 #define NF_NETLINK_CONNTRACK_EXP_DESTROY 0x00000020 +#endif + +enum nfnetlink_groups { + NFNLGRP_NONE, +#define NFNLGRP_NONE NFNLGRP_NONE + NFNLGRP_CONNTRACK_NEW, +#define NFNLGRP_CONNTRACK_NEW NFNLGRP_CONNTRACK_NEW + NFNLGRP_CONNTRACK_UPDATE, +#define NFNLGRP_CONNTRACK_UPDATE NFNLGRP_CONNTRACK_UPDATE + NFNLGRP_CONNTRACK_DESTROY, +#define NFNLGRP_CONNTRACK_DESTROY NFNLGRP_CONNTRACK_DESTROY + NFNLGRP_CONNTRACK_EXP_NEW, +#define NFNLGRP_CONNTRACK_EXP_NEW NFNLGRP_CONNTRACK_EXP_NEW + NFNLGRP_CONNTRACK_EXP_UPDATE, +#define NFNLGRP_CONNTRACK_EXP_UPDATE NFNLGRP_CONNTRACK_EXP_UPDATE + NFNLGRP_CONNTRACK_EXP_DESTROY, +#define NFNLGRP_CONNTRACK_EXP_DESTROY NFNLGRP_CONNTRACK_EXP_DESTROY + __NFNLGRP_MAX, +}; +#define NFNLGRP_MAX (__NFNLGRP_MAX - 1) /* Generic structure for encapsulation optional netfilter information. * It is reminiscent of sockaddr, but with sa_family replaced diff --git a/include/linux/netfilter_decnet.h b/include/linux/netfilter_decnet.h index 0189794..6f42536 100644 --- a/include/linux/netfilter_decnet.h +++ b/include/linux/netfilter_decnet.h @@ -56,7 +56,21 @@ struct nf_dn_rtmsg { #define NFDN_RTMSG(r) ((unsigned char *)(r) + NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg))) +#ifndef __KERNEL__ +/* backwards compatibility for userspace */ #define DNRMG_L1_GROUP 0x01 #define DNRMG_L2_GROUP 0x02 +#endif + +enum { + DNRNG_NLGRP_NONE, +#define DNRNG_NLGRP_NONE DNRNG_NLGRP_NONE + DNRNG_NLGRP_L1, +#define DNRNG_NLGRP_L1 DNRNG_NLGRP_L1 + DNRNG_NLGRP_L2, +#define DNRNG_NLGRP_L2 DNRNG_NLGRP_L2 + __DNRNG_NLGRP_MAX +}; +#define DNRNG_NLGRP_MAX (__DNRNG_NLGRP_MAX - 1) #endif /*__LINUX_DECNET_NETFILTER_H*/ diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 657c05a..c231e9a 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -826,9 +826,8 @@ enum #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) - -/* RTnetlink multicast groups */ - +#ifndef __KERNEL__ +/* RTnetlink multicast groups - backwards compatibility for userspace */ #define RTMGRP_LINK 1 #define RTMGRP_NOTIFY 2 #define RTMGRP_NEIGH 4 @@ -847,6 +846,43 @@ enum #define RTMGRP_DECnet_ROUTE 0x4000 #define RTMGRP_IPV6_PREFIX 0x20000 +#endif + +/* RTnetlink multicast groups */ +enum rtnetlink_groups { + RTNLGRP_NONE, +#define RTNLGRP_NONE RTNLGRP_NONE + RTNLGRP_LINK, +#define RTNLGRP_LINK RTNLGRP_LINK + RTNLGRP_NOTIFY, +#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY + RTNLGRP_NEIGH, +#define RTNLGRP_NEIGH RTNLGRP_NEIGH + RTNLGRP_TC, +#define RTNLGRP_TC RTNLGRP_TC + RTNLGRP_IPV4_IFADDR, +#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR + RTNLGRP_IPV4_MROUTE, +#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE + RTNLGRP_IPV4_ROUTE, +#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE + RTNLGRP_IPV6_IFADDR, +#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR + RTNLGRP_IPV6_MROUTE, +#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE + RTNLGRP_IPV6_ROUTE, +#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE + RTNLGRP_IPV6_IFINFO, +#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO + RTNLGRP_DECnet_IFADDR, +#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR + RTNLGRP_DECnet_ROUTE, +#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE + RTNLGRP_IPV6_PREFIX, +#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX + __RTNLGRP_MAX +}; +#define RTNLGRP_MAX (__RTNLGRP_MAX - 1) /* TC action piece */ struct tcamsg diff --git a/include/linux/selinux_netlink.h b/include/linux/selinux_netlink.h index 957e6eb..bbf489d 100644 --- a/include/linux/selinux_netlink.h +++ b/include/linux/selinux_netlink.h @@ -20,10 +20,21 @@ enum { SELNL_MSG_MAX }; -/* Multicast groups */ +#ifndef __KERNEL__ +/* Multicast groups - backwards compatiblility for userspace */ #define SELNL_GRP_NONE 0x00000000 #define SELNL_GRP_AVC 0x00000001 /* AVC notifications */ #define SELNL_GRP_ALL 0xffffffff +#endif + +enum selinux_nlgroups { + SELNLGRP_NONE, +#define SELNLGRP_NONE SELNLGRP_NONE + SELNLGRP_AVC, +#define SELNLGRP_AVC SELNLGRP_AVC + __SELNLGRP_MAX +}; +#define SELNLGRP_MAX (__SELNLGRP_MAX - 1) /* Message structures */ struct selnl_msg_setenforce { diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index f0d4233..0fb077d 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -258,9 +258,27 @@ struct xfrm_usersa_flush { __u8 proto; }; +#ifndef __KERNEL__ +/* backwards compatibility for userspace */ #define XFRMGRP_ACQUIRE 1 #define XFRMGRP_EXPIRE 2 #define XFRMGRP_SA 4 #define XFRMGRP_POLICY 8 +#endif + +enum xfrm_nlgroups { + XFRMNLGRP_NONE, +#define XFRMNLGRP_NONE XFRMNLGRP_NONE + XFRMNLGRP_ACQUIRE, +#define XFRMNLGRP_ACQUIRE XFRMNLGRP_ACQUIRE + XFRMNLGRP_EXPIRE, +#define XFRMNLGRP_EXPIRE XFRMNLGRP_EXPIRE + XFRMNLGRP_SA, +#define XFRMNLGRP_SA XFRMNLGRP_SA + XFRMNLGRP_POLICY, +#define XFRMNLGRP_POLICY XFRMNLGRP_POLICY + __XFRMNLGRP_MAX +}; +#define XFRMNLGRP_MAX (__XFRMNLGRP_MAX - 1) #endif /* _LINUX_XFRM_H */ diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index bc00061..1ebd735 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -93,7 +93,7 @@ static int send_uevent(const char *signal, const char *obj, } } - NETLINK_CB(skb).dst_groups = 1; + NETLINK_CB(skb).dst_group = 1; return netlink_broadcast(uevent_sock, skb, 0, 1, gfp_mask); } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index acb888d..6845b5d 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -78,8 +78,8 @@ static void ulog_send(unsigned int nlgroup) if (ub->qlen > 1) ub->lastnlh->nlmsg_type = NLMSG_DONE; - NETLINK_CB(ub->skb).dst_groups = 1 << nlgroup; - netlink_broadcast(ebtulognl, ub->skb, 0, 1 << nlgroup, GFP_ATOMIC); + NETLINK_CB(ub->skb).dst_group = nlgroup + 1; + netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC); ub->qlen = 0; ub->skb = NULL; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 72ee00f..39fc55e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2343,8 +2343,8 @@ void neigh_app_ns(struct neighbour *n) } nlh = (struct nlmsghdr *)skb->data; nlh->nlmsg_flags = NLM_F_REQUEST; - NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; - netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC); } static void neigh_app_notify(struct neighbour *n) @@ -2361,8 +2361,8 @@ static void neigh_app_notify(struct neighbour *n) return; } nlh = (struct nlmsghdr *)skb->data; - NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; - netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC); } #endif /* CONFIG_ARPD */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9b3c61f..5f3f95b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -148,7 +148,7 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) { int err = 0; - NETLINK_CB(skb).dst_groups = group; + NETLINK_CB(skb).dst_group = group; if (echo) atomic_inc(&skb->users); netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); @@ -458,8 +458,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) kfree_skb(skb); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_LINK; - netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL); + NETLINK_CB(skb).dst_group = RTNLGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL); } static int rtnetlink_done(struct netlink_callback *cb) diff --git a/net/core/wireless.c b/net/core/wireless.c index 3ff5639..19fa6a5 100644 --- a/net/core/wireless.c +++ b/net/core/wireless.c @@ -1144,8 +1144,8 @@ static inline void rtmsg_iwinfo(struct net_device * dev, kfree_skb(skb); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_LINK; - netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC); } #endif /* WE_EVENT_NETLINK */ diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 00233ec..5610bb1 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -752,16 +752,16 @@ static void rtmsg_ifa(int event, struct dn_ifaddr *ifa) skb = alloc_skb(size, GFP_KERNEL); if (!skb) { - netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, ENOBUFS); return; } if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, EINVAL); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_IFADDR; - netlink_broadcast(rtnl, skb, 0, RTMGRP_DECnet_IFADDR, GFP_KERNEL); + NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_DECnet_IFADDR, GFP_KERNEL); } static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 28ba5777a..73a8848 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -349,10 +349,10 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id, kfree_skb(skb); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_ROUTE; + NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_ROUTE; if (nlh->nlmsg_flags & NLM_F_ECHO) atomic_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, RTMGRP_DECnet_ROUTE, GFP_KERNEL); + netlink_broadcast(rtnl, skb, pid, RTNLGRP_DECnet_ROUTE, GFP_KERNEL); if (nlh->nlmsg_flags & NLM_F_ECHO) netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); } diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 3068fdd..353fed6 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -71,10 +71,10 @@ static void dnrmg_send_peer(struct sk_buff *skb) switch(flags & DN_RT_CNTL_MSK) { case DN_RT_PKT_L1RT: - group = DNRMG_L1_GROUP; + group = DNRMG_L1_NLGRP; break; case DN_RT_PKT_L2RT: - group = DNRMG_L2_GROUP; + group = DNRMG_L2_NLGRP; break; default: return; @@ -83,7 +83,7 @@ static void dnrmg_send_peer(struct sk_buff *skb) skb2 = dnrmg_build_message(skb, &status); if (skb2 == NULL) return; - NETLINK_CB(skb2).dst_groups = group; + NETLINK_CB(skb2).dst_group = group; netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC); } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d8a10e3..ba2895a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1111,13 +1111,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa) struct sk_buff *skb = alloc_skb(size, GFP_KERNEL); if (!skb) - netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS); else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL); } else { - NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL); + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL); } } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 75d03e3..d4e7b57 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -560,7 +560,7 @@ static void nl_fib_input(struct sock *sk, int len) pid = nlh->nlmsg_pid; /*pid of sending process */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_pid = pid; - NETLINK_CB(skb).dst_groups = 0; /* unicast */ + NETLINK_CB(skb).dst_group = 0; /* unicast */ netlink_unicast(sk, skb, pid, MSG_DONTWAIT); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e278cb9..7e4651b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa, kfree_skb(skb); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; + NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE; if (n->nlmsg_flags&NLM_F_ECHO) atomic_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL); + netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL); if (n->nlmsg_flags&NLM_F_ECHO) netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); } diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 1221a9c..a4e9278 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -297,7 +297,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, struct sk_buff *skb; unsigned int type; unsigned char *b; - unsigned int flags = 0, groups; + unsigned int flags = 0, group; /* ignore our fake conntrack entry */ if (ct == &ip_conntrack_untracked) @@ -305,7 +305,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, if (events & IPCT_DESTROY) { type = IPCTNL_MSG_CT_DELETE; - groups = NF_NETLINK_CONNTRACK_DESTROY; + group = NFNLGRP_CONNTRACK_DESTROY; goto alloc_skb; } if (events & (IPCT_NEW | IPCT_RELATED)) { @@ -313,7 +313,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, flags = NLM_F_CREATE|NLM_F_EXCL; /* dump everything */ events = ~0UL; - groups = NF_NETLINK_CONNTRACK_NEW; + group = NFNLGRP_CONNTRACK_NEW; goto alloc_skb; } if (events & (IPCT_STATUS | @@ -322,7 +322,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, IPCT_HELPINFO | IPCT_NATINFO)) { type = IPCTNL_MSG_CT_NEW; - groups = NF_NETLINK_CONNTRACK_UPDATE; + group = NFNLGRP_CONNTRACK_UPDATE; goto alloc_skb; } @@ -375,7 +375,7 @@ alloc_skb: goto nfattr_failure; nlh->nlmsg_len = skb->tail - b; - nfnetlink_send(skb, 0, groups, 0); + nfnetlink_send(skb, 0, group, 0); return NOTIFY_DONE; nlmsg_failure: @@ -1194,7 +1194,7 @@ static int ctnetlink_expect_event(struct notifier_block *this, nlh->nlmsg_len = skb->tail - b; proto = exp->tuple.dst.protonum; - nfnetlink_send(skb, 0, NF_NETLINK_CONNTRACK_EXP_NEW, 0); + nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); return NOTIFY_DONE; nlmsg_failure: diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 1d8ac45..89816b8 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -116,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum) if (ub->qlen > 1) ub->lastnlh->nlmsg_type = NLMSG_DONE; - NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum); - DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n", - ub->qlen, nlgroupnum); - netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC); + NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; + DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n", + ub->qlen, nlgroupnum + 1); + netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); ub->qlen = 0; ub->skb = NULL; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b9c3da3..493abf9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2858,16 +2858,16 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { - netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, ENOBUFS); return; } if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, EINVAL); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFADDR; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFADDR, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFADDR, GFP_ATOMIC); } static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, @@ -2994,16 +2994,16 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { - netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, ENOBUFS); return; } if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, EINVAL); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFINFO; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFINFO, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFINFO; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC); } static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, @@ -3054,16 +3054,16 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { - netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, ENOBUFS); return; } if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, EINVAL); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_PREFIX; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_PREFIX, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_PREFIX; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_PREFIX, GFP_ATOMIC); } static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 878789b..6ea494a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1850,16 +1850,16 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, skb = alloc_skb(size, gfp_any()); if (!skb) { - netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS); return; } if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any()); + NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any()); } /* diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 84efffd..36a4c5f 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -198,7 +198,7 @@ int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) int allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL; int err = 0; - NETLINK_CB(skb).dst_groups = group; + NETLINK_CB(skb).dst_group = group; if (echo) atomic_inc(&skb->users); netlink_broadcast(nfnl, skb, pid, group, allocation); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index c896a01..8aebe8f 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -593,7 +593,7 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) nlh->nlmsg_flags |= NLM_F_ROOT; module_put(a->ops->owner); kfree(a); - err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + err = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); if (err > 0) return 0; @@ -656,7 +656,7 @@ tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event) /* now do the delete */ tcf_action_destroy(head, 0); - ret = rtnetlink_send(skb, pid, RTMGRP_TC, + ret = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); if (ret > 0) return 0; @@ -698,9 +698,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, x->rta_len = skb->tail - (u8*)x; nlh->nlmsg_len = skb->tail - b; - NETLINK_CB(skb).dst_groups = RTMGRP_TC; + NETLINK_CB(skb).dst_group = RTNLGRP_TC; - err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO); + err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO); if (err > 0) err = 0; return err; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 3b5714e..b4d89fb 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -367,7 +367,7 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, return -EINVAL; } - return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); } struct tcf_dump_args diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b9a069a..737681c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -816,7 +816,7 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, } if (skb->len) - return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); err_out: kfree_skb(skb); @@ -1040,7 +1040,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, return -EINVAL; } - return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); } struct qdisc_dump_args diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4d553a1..0579d20 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1125,9 +1125,8 @@ static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c) if (build_expire(skb, x, c->data.hard) < 0) BUG(); - NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE; - - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC); } static int xfrm_notify_sa_flush(struct km_event *c) @@ -1152,9 +1151,8 @@ static int xfrm_notify_sa_flush(struct km_event *c) nlh->nlmsg_len = skb->tail - b; - NETLINK_CB(skb).dst_groups = XFRMGRP_SA; - - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_SA; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC); nlmsg_failure: kfree_skb(skb); @@ -1228,9 +1226,8 @@ static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c) nlh->nlmsg_len = skb->tail - b; - NETLINK_CB(skb).dst_groups = XFRMGRP_SA; - - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_SA; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC); nlmsg_failure: rtattr_failure: @@ -1308,9 +1305,8 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, if (build_acquire(skb, x, xt, xp, dir) < 0) BUG(); - NETLINK_CB(skb).dst_groups = XFRMGRP_ACQUIRE; - - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_ACQUIRE, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_ACQUIRE; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_ACQUIRE, GFP_ATOMIC); } /* User gives us xfrm_user_policy_info followed by an array of 0 @@ -1409,9 +1405,8 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve if (build_polexpire(skb, xp, dir, c->data.hard) < 0) BUG(); - NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE; - - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC); } static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c) @@ -1459,9 +1454,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event * nlh->nlmsg_len = skb->tail - b; - NETLINK_CB(skb).dst_groups = XFRMGRP_POLICY; - - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC); nlmsg_failure: rtattr_failure: @@ -1486,9 +1480,8 @@ static int xfrm_notify_policy_flush(struct km_event *c) nlh->nlmsg_len = skb->tail - b; - NETLINK_CB(skb).dst_groups = XFRMGRP_POLICY; - - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC); nlmsg_failure: kfree_skb(skb); diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index 0f7be65..20f4810 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -80,8 +80,8 @@ static void selnl_notify(int msgtype, void *data) nlh = NLMSG_PUT(skb, 0, 0, msgtype, len); selnl_add_payload(nlh, len, msgtype, data); nlh->nlmsg_len = skb->tail - tmp; - NETLINK_CB(skb).dst_groups = SELNL_GRP_AVC; - netlink_broadcast(selnl, skb, 0, SELNL_GRP_AVC, GFP_USER); + NETLINK_CB(skb).dst_group = SELNLGRP_AVC; + netlink_broadcast(selnl, skb, 0, SELNLGRP_AVC, GFP_USER); out: return; -- cgit v0.10.2 From ab33a1711cf60bfb562b1ab89ac9f23d1425e8b1 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 19:31:36 -0700 Subject: [NETLINK]: Return -EPROTONOSUPPORT in netlink_create() if no kernel socket is registered This is necessary for dynamic number of netlink groups to make sure we know the number of possible groups before bind() is called. With this change pure userspace communication using unused netlink protocols becomes impossible. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3c56b96..444ed22 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -102,6 +102,7 @@ struct netlink_table { struct hlist_head mc_list; unsigned int nl_nonroot; struct module *module; + int registered; }; static struct netlink_table *nl_table; @@ -343,11 +344,32 @@ static struct proto netlink_proto = { .obj_size = sizeof(struct netlink_sock), }; -static int netlink_create(struct socket *sock, int protocol) +static int __netlink_create(struct socket *sock, int protocol) { struct sock *sk; struct netlink_sock *nlk; - struct module *module; + + sock->ops = &netlink_ops; + + sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + nlk = nlk_sk(sk); + spin_lock_init(&nlk->cb_lock); + init_waitqueue_head(&nlk->wait); + + sk->sk_destruct = netlink_sock_destruct; + sk->sk_protocol = protocol; + return 0; +} + +static int netlink_create(struct socket *sock, int protocol) +{ + struct module *module = NULL; + int err = 0; sock->state = SS_UNCONNECTED; @@ -358,41 +380,33 @@ static int netlink_create(struct socket *sock, int protocol) return -EPROTONOSUPPORT; netlink_lock_table(); - if (!nl_table[protocol].hash.entries) { #ifdef CONFIG_KMOD - /* We do 'best effort'. If we find a matching module, - * it is loaded. If not, we don't return an error to - * allow pure userspace<->userspace communication. -HW - */ + if (!nl_table[protocol].registered) { netlink_unlock_table(); request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); netlink_lock_table(); -#endif } - module = nl_table[protocol].module; - if (!try_module_get(module)) - module = NULL; +#endif + if (nl_table[protocol].registered && + try_module_get(nl_table[protocol].module)) + module = nl_table[protocol].module; + else + err = -EPROTONOSUPPORT; netlink_unlock_table(); - sock->ops = &netlink_ops; - - sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); - if (!sk) { - module_put(module); - return -ENOMEM; - } - - sock_init_data(sock, sk); + if (err) + goto out; - nlk = nlk_sk(sk); + if ((err = __netlink_create(sock, protocol) < 0)) + goto out_module; - nlk->module = module; - spin_lock_init(&nlk->cb_lock); - init_waitqueue_head(&nlk->wait); - sk->sk_destruct = netlink_sock_destruct; + nlk_sk(sock->sk)->module = module; +out: + return err; - sk->sk_protocol = protocol; - return 0; +out_module: + module_put(module); + goto out; } static int netlink_release(struct socket *sock) @@ -437,6 +451,7 @@ static int netlink_release(struct socket *sock) if (nlk->flags & NETLINK_KERNEL_SOCKET) { netlink_table_grab(); nl_table[sk->sk_protocol].module = NULL; + nl_table[sk->sk_protocol].registered = 0; netlink_table_ungrab(); } @@ -1082,7 +1097,7 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - if (netlink_create(sock, unit) < 0) + if (__netlink_create(sock, unit) < 0) goto out_sock_release; sk = sock->sk; @@ -1098,6 +1113,7 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct netlink_table_grab(); nl_table[unit].module = module; + nl_table[unit].registered = 1; netlink_table_ungrab(); return sk; -- cgit v0.10.2 From f7fa9b10edbb9391bdd4ec8e8b3d621d0664b198 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 15 Aug 2005 12:29:13 -0700 Subject: [NETLINK]: Support dynamic number of multicast groups per netlink family Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 444ed22..58d4ca4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -60,21 +60,24 @@ #include #define Nprintk(a...) +#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; u32 pid; - unsigned int groups; u32 dst_pid; u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; unsigned long state; wait_queue_head_t wait; struct netlink_callback *cb; spinlock_t cb_lock; void (*data_ready)(struct sock *sk, int bytes); struct module *module; - u32 flags; }; #define NETLINK_KERNEL_SOCKET 0x1 @@ -101,6 +104,7 @@ struct netlink_table { struct nl_pid_hash hash; struct hlist_head mc_list; unsigned int nl_nonroot; + unsigned int groups; struct module *module; int registered; }; @@ -138,6 +142,7 @@ static void netlink_sock_destruct(struct sock *sk) BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); BUG_TRAP(!nlk_sk(sk)->cb); + BUG_TRAP(!nlk_sk(sk)->groups); } /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP. @@ -333,7 +338,7 @@ static void netlink_remove(struct sock *sk) netlink_table_grab(); if (sk_del_node_init(sk)) nl_table[sk->sk_protocol].hash.entries--; - if (nlk_sk(sk)->groups) + if (nlk_sk(sk)->subscriptions) __sk_del_bind_node(sk); netlink_table_ungrab(); } @@ -369,6 +374,8 @@ static int __netlink_create(struct socket *sock, int protocol) static int netlink_create(struct socket *sock, int protocol) { struct module *module = NULL; + struct netlink_sock *nlk; + unsigned int groups; int err = 0; sock->state = SS_UNCONNECTED; @@ -392,15 +399,23 @@ static int netlink_create(struct socket *sock, int protocol) module = nl_table[protocol].module; else err = -EPROTONOSUPPORT; + groups = nl_table[protocol].groups; netlink_unlock_table(); - if (err) - goto out; + if (err || (err = __netlink_create(sock, protocol) < 0)) + goto out_module; + + nlk = nlk_sk(sock->sk); - if ((err = __netlink_create(sock, protocol) < 0)) + nlk->groups = kmalloc(NLGRPSZ(groups), GFP_KERNEL); + if (nlk->groups == NULL) { + err = -ENOMEM; goto out_module; + } + memset(nlk->groups, 0, NLGRPSZ(groups)); + nlk->ngroups = groups; - nlk_sk(sock->sk)->module = module; + nlk->module = module; out: return err; @@ -437,7 +452,7 @@ static int netlink_release(struct socket *sock) skb_queue_purge(&sk->sk_write_queue); - if (nlk->pid && !nlk->groups) { + if (nlk->pid && !nlk->subscriptions) { struct netlink_notify n = { .protocol = sk->sk_protocol, .pid = nlk->pid, @@ -455,6 +470,9 @@ static int netlink_release(struct socket *sock) netlink_table_ungrab(); } + kfree(nlk->groups); + nlk->groups = NULL; + sock_put(sk); return 0; } @@ -503,6 +521,18 @@ static inline int netlink_capable(struct socket *sock, unsigned int flag) capable(CAP_NET_ADMIN); } +static void +netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->subscriptions && !subscriptions) + __sk_del_bind_node(sk); + else if (!nlk->subscriptions && subscriptions) + sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); + nlk->subscriptions = subscriptions; +} + static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; @@ -528,15 +558,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len return err; } - if (!nladdr->nl_groups && !nlk->groups) + if (!nladdr->nl_groups && !(u32)nlk->groups[0]) return 0; netlink_table_grab(); - if (nlk->groups && !nladdr->nl_groups) - __sk_del_bind_node(sk); - else if (!nlk->groups && nladdr->nl_groups) - sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); - nlk->groups = nladdr->nl_groups; + netlink_update_subscriptions(sk, nlk->subscriptions + + hweight32(nladdr->nl_groups) - + hweight32(nlk->groups[0])); + nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; netlink_table_ungrab(); return 0; @@ -590,7 +619,7 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr nladdr->nl_groups = netlink_group_mask(nlk->dst_group); } else { nladdr->nl_pid = nlk->pid; - nladdr->nl_groups = nlk->groups; + nladdr->nl_groups = nlk->groups[0]; } return 0; } @@ -791,7 +820,8 @@ static inline int do_one_broadcast(struct sock *sk, if (p->exclude_sk == sk) goto out; - if (nlk->pid == p->pid || !(nlk->groups & netlink_group_mask(p->group))) + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + !test_bit(p->group - 1, nlk->groups)) goto out; if (p->failure) { @@ -887,7 +917,8 @@ static inline int do_one_set_err(struct sock *sk, if (sk == p->exclude_sk) goto out; - if (nlk->pid == p->pid || !(nlk->groups & netlink_group_mask(p->group))) + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + !test_bit(p->group - 1, nlk->groups)) goto out; sk->sk_err = p->code; @@ -1112,6 +1143,7 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct nlk->flags |= NETLINK_KERNEL_SOCKET; netlink_table_grab(); + nl_table[unit].groups = 32; nl_table[unit].module = module; nl_table[unit].registered = 1; netlink_table_ungrab(); @@ -1358,7 +1390,8 @@ static int netlink_seq_show(struct seq_file *seq, void *v) s, s->sk_protocol, nlk->pid, - nlk->groups, + nlk->flags & NETLINK_KERNEL_SOCKET ? + 0 : (unsigned int)nlk->groups[0], atomic_read(&s->sk_rmem_alloc), atomic_read(&s->sk_wmem_alloc), nlk->cb, -- cgit v0.10.2 From 9a4595bc7e67962f13232ee55a64e063062c3a99 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 15 Aug 2005 12:32:15 -0700 Subject: [NETLINK]: Add set/getsockopt options to support more than 32 groups NETLINK_ADD_MEMBERSHIP/NETLINK_DROP_MEMBERSHIP are used to join/leave groups, NETLINK_PKTINFO is used to enable nl_pktinfo control messages for received packets to get the extended destination group number. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netlink.h b/include/linux/netlink.h index c724c9d..36a4044 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -90,6 +90,15 @@ struct nlmsgerr struct nlmsghdr msg; }; +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 + +struct nl_pktinfo +{ + __u32 group; +}; + #define NET_MAJOR 36 /* Major 36 is reserved for networking */ enum { diff --git a/include/linux/socket.h b/include/linux/socket.h index ddf2255..acc55aa 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -272,6 +272,7 @@ struct ucred { #define SOL_NETBEUI 267 #define SOL_LLC 268 #define SOL_DCCP 269 +#define SOL_NETLINK 270 /* IPX options */ #define IPX_TYPE 1 diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 58d4ca4..47e7917 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -81,6 +81,7 @@ struct netlink_sock { }; #define NETLINK_KERNEL_SOCKET 0x1 +#define NETLINK_RECV_PKTINFO 0x2 static inline struct netlink_sock *nlk_sk(struct sock *sk) { @@ -946,6 +947,94 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) read_unlock(&nl_table_lock); } +static int netlink_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + int val = 0, err; + + if (level != SOL_NETLINK) + return -ENOPROTOOPT; + + if (optlen >= sizeof(int) && + get_user(val, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case NETLINK_PKTINFO: + if (val) + nlk->flags |= NETLINK_RECV_PKTINFO; + else + nlk->flags &= ~NETLINK_RECV_PKTINFO; + err = 0; + break; + case NETLINK_ADD_MEMBERSHIP: + case NETLINK_DROP_MEMBERSHIP: { + unsigned int subscriptions; + int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0; + + if (!netlink_capable(sock, NL_NONROOT_RECV)) + return -EPERM; + if (!val || val - 1 >= nlk->ngroups) + return -EINVAL; + netlink_table_grab(); + old = test_bit(val - 1, nlk->groups); + subscriptions = nlk->subscriptions - old + new; + if (new) + __set_bit(val - 1, nlk->groups); + else + __clear_bit(val - 1, nlk->groups); + netlink_update_subscriptions(sk, subscriptions); + netlink_table_ungrab(); + err = 0; + break; + } + default: + err = -ENOPROTOOPT; + } + return err; +} + +static int netlink_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + int len, val, err; + + if (level != SOL_NETLINK) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case NETLINK_PKTINFO: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; + put_user(len, optlen); + put_user(val, optval); + err = 0; + break; + default: + err = -ENOPROTOOPT; + } + return err; +} + +static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct nl_pktinfo info; + + info.group = NETLINK_CB(skb).dst_group; + put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); +} + static inline void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); @@ -1091,6 +1180,8 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, netlink_dump(sk); scm_recv(sock, msg, siocb->scm, flags); + if (nlk->flags & NETLINK_RECV_PKTINFO) + netlink_cmsg_recv_pktinfo(msg, skb); out: netlink_rcv_wake(sk); @@ -1465,8 +1556,8 @@ static struct proto_ops netlink_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, + .setsockopt = netlink_setsockopt, + .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, -- cgit v0.10.2 From 066286071d3542243baa68166acb779187c848b3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 15 Aug 2005 12:33:26 -0700 Subject: [NETLINK]: Add "groups" argument to netlink_kernel_create Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index f3f339d..498ad505 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -88,7 +88,7 @@ static struct w1_master * w1_alloc_dev(u32 id, int slave_count, int slave_ttl, dev->groups = 1; dev->seq = 1; - dev->nls = netlink_kernel_create(NETLINK_W1, NULL, THIS_MODULE); + dev->nls = netlink_kernel_create(NETLINK_W1, 1, NULL, THIS_MODULE); if (!dev->nls) { printk(KERN_ERR "Failed to create new netlink socket(%u) for w1 master %s.\n", NETLINK_NFLOG, dev->dev.bus_id); diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 36a4044..7d1d968 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -125,7 +125,7 @@ struct netlink_skb_parms #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) -extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct module *module); +extern struct sock *netlink_kernel_create(int unit, unsigned int groups, void (*input)(struct sock *sk, int len), struct module *module); extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock); extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid, diff --git a/kernel/audit.c b/kernel/audit.c index ed40195..7f06997 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -514,7 +514,7 @@ static int __init audit_init(void) { printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive, + audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 1ebd735..04ca442 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -154,7 +154,7 @@ EXPORT_SYMBOL_GPL(kobject_uevent_atomic); static int __init kobject_uevent_init(void) { - uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, NULL, + uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, 1, NULL, THIS_MODULE); if (!uevent_sock) { diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 6845b5d..aae26ae 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -258,7 +258,8 @@ static int __init init(void) spin_lock_init(&ulog_buffers[i].lock); } - ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL, THIS_MODULE); + ebtulognl = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS, + NULL, THIS_MODULE); if (!ebtulognl) ret = -ENOMEM; else if ((ret = ebt_register_watcher(&ulog))) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5f3f95b..9bed756 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -708,7 +708,8 @@ void __init rtnetlink_init(void) if (!rta_buf) panic("rtnetlink_init: cannot allocate rta_buf\n"); - rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv, THIS_MODULE); + rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv, + THIS_MODULE); if (rtnl == NULL) panic("rtnetlink_init: cannot initialize rtnetlink\n"); netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 353fed6..afb33a2 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -138,8 +138,8 @@ static int __init init(void) { int rv = 0; - dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk, - THIS_MODULE); + dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, + dnrmg_receive_user_sk, THIS_MODULE); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); return -ENOMEM; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d4e7b57..4e1379f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -566,7 +566,7 @@ static void nl_fib_input(struct sock *sk, int len) static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input, THIS_MODULE); + netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 1880ad8..71f3c73 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -845,7 +845,7 @@ static int __init inet_diag_init(void) goto out; memset(inet_diag_table, 0, inet_diag_table_size); - idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, inet_diag_rcv, + idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, THIS_MODULE); if (idiagnl == NULL) goto out_free_table; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 7f2bcc7..d54f14d 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -671,7 +671,7 @@ init_or_cleanup(int init) goto cleanup; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk, + ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 89816b8..e2c14f3 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -388,7 +388,8 @@ static int __init init(void) ulog_buffers[i].timer.data = i; } - nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL, THIS_MODULE); + nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, + THIS_MODULE); if (!nflognl) return -ENOMEM; diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 4467645..aa11cf3 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -667,7 +667,8 @@ init_or_cleanup(int init) goto cleanup; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk, THIS_MODULE); + ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk, + THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 36a4c5f..e089f17 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -355,8 +355,8 @@ int __init nfnetlink_init(void) { printk("Netfilter messages via NETLINK v%s.\n", nfversion); - nfnl = netlink_kernel_create(NETLINK_NETFILTER, nfnetlink_rcv, - THIS_MODULE); + nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX, + nfnetlink_rcv, THIS_MODULE); if (!nfnl) { printk(KERN_ERR "cannot initialize nfnetlink!\n"); return -1; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 47e7917..e259f46 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1204,7 +1204,9 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct module *module) +netlink_kernel_create(int unit, unsigned int groups, + void (*input)(struct sock *sk, int len), + struct module *module) { struct socket *sock; struct sock *sk; @@ -1234,7 +1236,7 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len), struct nlk->flags |= NETLINK_KERNEL_SOCKET; netlink_table_grab(); - nl_table[unit].groups = 32; + nl_table[unit].groups = groups < 32 ? 32 : groups; nl_table[unit].module = module; nl_table[unit].registered = 1; netlink_table_ungrab(); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0579d20..c35336a 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1520,8 +1520,8 @@ static int __init xfrm_user_init(void) { printk(KERN_INFO "Initializing IPsec netlink socket\n"); - xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv, - THIS_MODULE); + xfrm_nl = netlink_kernel_create(NETLINK_XFRM, XFRMNLGRP_MAX, + xfrm_netlink_rcv, THIS_MODULE); if (xfrm_nl == NULL) return -ENOMEM; diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index 20f4810..e203883 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -104,7 +104,8 @@ void selnl_notify_policyload(u32 seqno) static int __init selnl_init(void) { - selnl = netlink_kernel_create(NETLINK_SELINUX, NULL, THIS_MODULE); + selnl = netlink_kernel_create(NETLINK_SELINUX, SELNLGRP_MAX, NULL, + THIS_MODULE); if (selnl == NULL) panic("SELinux: Cannot create netlink socket."); netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV); -- cgit v0.10.2 From 216efaaaa006d2f3ecbb5bbc2b6673423813254e Mon Sep 17 00:00:00 2001 From: James Morris Date: Mon, 15 Aug 2005 20:34:48 -0700 Subject: [SELINUX]: Update for tcp_diag rename to inet_diag. Also, support dccp sockets. Signed-off-by: James Morris Signed-off-by: David S. Miller diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 2253f38..8641f88 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -659,7 +659,7 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc return SECCLASS_NETLINK_ROUTE_SOCKET; case NETLINK_FIREWALL: return SECCLASS_NETLINK_FIREWALL_SOCKET; - case NETLINK_TCPDIAG: + case NETLINK_INET_DIAG: return SECCLASS_NETLINK_TCPDIAG_SOCKET; case NETLINK_NFLOG: return SECCLASS_NETLINK_NFLOG_SOCKET; diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 92b057b..69b9329 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -76,6 +76,7 @@ static struct nlmsg_perm nlmsg_firewall_perms[] = static struct nlmsg_perm nlmsg_tcpdiag_perms[] = { { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, + { DCCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, }; static struct nlmsg_perm nlmsg_xfrm_perms[] = -- cgit v0.10.2 From 63a1222b1fd79c52491c14534b086bffbfaed8bf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 15 Aug 2005 20:35:44 -0700 Subject: [DECNET]: Fix build after netlink changes. Signed-off-by: Andrew Morton Signed-off-by: David S. Miller diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index afb33a2..1ab94c6 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -71,10 +72,10 @@ static void dnrmg_send_peer(struct sk_buff *skb) switch(flags & DN_RT_CNTL_MSK) { case DN_RT_PKT_L1RT: - group = DNRMG_L1_NLGRP; + group = DNRNG_NLGRP_L1; break; case DN_RT_PKT_L2RT: - group = DNRMG_L2_NLGRP; + group = DNRNG_NLGRP_L2; break; default: return; -- cgit v0.10.2 From 9deff7f2365958c5c5aa8cb5a0dd651c4dd83f8f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 15 Aug 2005 21:13:25 -0700 Subject: [RXRPC]: Fix build failure introduced by skb->stamp changes. Signed-off-by: Andrew Morton Signed-off-by: David S. Miller diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 9bce779..122c086 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -330,7 +330,7 @@ static int rxrpc_incoming_msg(struct rxrpc_transport *trans, msg->trans = trans; msg->state = RXRPC_MSG_RECEIVED; - msg->stamp = pkt->stamp; + skb_get_timestamp(pkt, &msg->stamp); if (msg->stamp.tv_sec == 0) { do_gettimeofday(&msg->stamp); if (pkt->sk) -- cgit v0.10.2 From 20380731bc2897f2952ae055420972ded4cd786e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 16 Aug 2005 02:18:02 -0300 Subject: [NET]: Fix sparse warnings Of this type, mostly: CHECK net/ipv6/netfilter.c net/ipv6/netfilter.c:96:12: warning: symbol 'ipv6_netfilter_init' was not declared. Should it be static? net/ipv6/netfilter.c:101:6: warning: symbol 'ipv6_netfilter_fini' was not declared. Should it be static? Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index b5b58e9..fc2d4c8 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -110,6 +110,8 @@ static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb->mac.raw; } + +extern struct ctl_table ether_table[]; #endif #endif /* _LINUX_IF_ETHER_H */ diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h index 3c94b17..511999c 100644 --- a/include/linux/if_frad.h +++ b/include/linux/if_frad.h @@ -191,10 +191,12 @@ struct frad_local int buffer; /* current buffer for S508 firmware */ }; -extern void dlci_ioctl_set(int (*hook)(unsigned int, void __user *)); - #endif /* __KERNEL__ */ #endif /* CONFIG_DLCI || CONFIG_DLCI_MODULE */ +#ifdef __KERNEL__ +extern void dlci_ioctl_set(int (*hook)(unsigned int, void __user *)); +#endif + #endif diff --git a/include/linux/if_tr.h b/include/linux/if_tr.h index 3fba9e2..5502f59 100644 --- a/include/linux/if_tr.h +++ b/include/linux/if_tr.h @@ -43,12 +43,16 @@ struct trh_hdr { }; #ifdef __KERNEL__ +#include #include static inline struct trh_hdr *tr_hdr(const struct sk_buff *skb) { return (struct trh_hdr *)skb->mac.raw; } +#ifdef CONFIG_SYSCTL +extern struct ctl_table tr_table[]; +#endif #endif /* This is an Token-Ring LLC structure */ diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 0c31ef0..28f4f3b 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -129,6 +129,9 @@ struct igmpv3_query { #include #include +extern int sysctl_igmp_max_memberships; +extern int sysctl_igmp_max_msf; + struct ip_sf_socklist { unsigned int sl_max; diff --git a/include/linux/net.h b/include/linux/net.h index 5f8b632..4e98158 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -286,5 +286,12 @@ static struct proto_ops name##_ops = { \ #define MODULE_ALIAS_NET_PF_PROTO(pf, proto) \ MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto)) +#ifdef CONFIG_SYSCTL +#include +extern ctl_table net_table[]; +extern int net_msg_cost; +extern int net_msg_burst; +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_NET_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8e52ed..1fcaa88 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -244,6 +244,7 @@ struct netdev_boot_setup { }; #define NETDEV_BOOT_SETUP_MAX 8 +extern int __init netdev_boot_setup(char *str); /* * The DEVICE structure. @@ -673,6 +674,7 @@ extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern void dev_init(void); extern int netdev_nit; +extern int netdev_budget; /* Called by rtnetlink.c:rtnl_unlock() */ extern void netdev_run_todo(void); @@ -908,6 +910,14 @@ extern int skb_checksum_help(struct sk_buff *skb, int inward); extern void net_enable_timestamp(void); extern void net_disable_timestamp(void); +#ifdef CONFIG_PROC_FS +extern void *dev_seq_start(struct seq_file *seq, loff_t *pos); +extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); +extern void dev_seq_stop(struct seq_file *seq, void *v); +#endif + +extern void linkwatch_run_queue(void); + #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 5d204ee..edcc2c6 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -71,7 +71,7 @@ enum nf_ip6_hook_priorities { NF_IP6_PRI_LAST = INT_MAX, }; -int ipv6_netfilter_init(void); -void ipv6_netfilter_fini(void); +extern int ipv6_netfilter_init(void); +extern void ipv6_netfilter_fini(void); #endif /*__LINUX_IP6_NETFILTER_H*/ diff --git a/include/linux/security.h b/include/linux/security.h index b42095a..7aab6ab 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2727,7 +2727,8 @@ static inline int security_socket_getpeersec(struct socket *sock, char __user *o return security_ops->socket_getpeersec(sock, optval, optlen, len); } -static inline int security_sk_alloc(struct sock *sk, int family, int priority) +static inline int security_sk_alloc(struct sock *sk, int family, + unsigned int __nocast priority) { return security_ops->sk_alloc_security(sk, family, priority); } @@ -2844,7 +2845,8 @@ static inline int security_socket_getpeersec(struct socket *sock, char __user *o return -ENOPROTOOPT; } -static inline int security_sk_alloc(struct sock *sk, int family, int priority) +static inline int security_sk_alloc(struct sock *sk, int family, + unsigned int __nocast priority) { return 0; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 32635c4..db10335 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1203,6 +1203,8 @@ extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); +extern void skb_release_data(struct sk_buff *skb); + static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { diff --git a/include/linux/socket.h b/include/linux/socket.h index acc55aa..1739c2d 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -26,6 +26,13 @@ struct __kernel_sockaddr_storage { #include /* pid_t */ #include /* __user */ +extern int sysctl_somaxconn; +extern void sock_init(void); +#ifdef CONFIG_PROC_FS +struct seq_file; +extern void socket_seq_show(struct seq_file *seq); +#endif + typedef unsigned short sa_family_t; /* diff --git a/include/net/addrconf.h b/include/net/addrconf.h index a0ed936..750e250 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -45,6 +45,7 @@ struct prefix_info { #ifdef __KERNEL__ +#include #include #include #include @@ -238,5 +239,10 @@ static inline int ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) addr->s6_addr32[3] == htonl(0x00000002)); } +#ifdef CONFIG_PROC_FS +extern int if6_proc_init(void); +extern void if6_proc_exit(void); +#endif + #endif #endif diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b60b384..b5d785a 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -1,5 +1,11 @@ #ifndef __LINUX_NET_AFUNIX_H #define __LINUX_NET_AFUNIX_H + +#include +#include +#include +#include + extern void unix_inflight(struct file *fp); extern void unix_notinflight(struct file *fp); extern void unix_gc(void); @@ -74,5 +80,14 @@ struct unix_sock { wait_queue_head_t peer_wait; }; #define unix_sk(__sk) ((struct unix_sock *)__sk) + +#ifdef CONFIG_SYSCTL +extern int sysctl_unix_max_dgram_qlen; +extern void unix_sysctl_register(void); +extern void unix_sysctl_unregister(void); +#else +static inline void unix_sysctl_register(void) {} +static inline void unix_sysctl_unregister(void) {} +#endif #endif #endif diff --git a/include/net/icmp.h b/include/net/icmp.h index e5ef0d15..6cdebee 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -57,4 +57,11 @@ static inline struct raw_sock *raw_sk(const struct sock *sk) return (struct raw_sock *)sk; } +extern int sysctl_icmp_echo_ignore_all; +extern int sysctl_icmp_echo_ignore_broadcasts; +extern int sysctl_icmp_ignore_bogus_error_responses; +extern int sysctl_icmp_errors_use_inbound_ifaddr; +extern int sysctl_icmp_ratelimit; +extern int sysctl_icmp_ratemask; + #endif /* _ICMP_H */ diff --git a/include/net/ip.h b/include/net/ip.h index c16fb6a..7623e41 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -163,6 +163,24 @@ extern int sysctl_local_port_range[2]; extern int sysctl_ip_default_ttl; extern int sysctl_ip_nonlocal_bind; +/* From ip_fragment.c */ +extern int sysctl_ipfrag_high_thresh; +extern int sysctl_ipfrag_low_thresh; +extern int sysctl_ipfrag_time; +extern int sysctl_ipfrag_secret_interval; + +/* From inetpeer.c */ +extern int inet_peer_threshold; +extern int inet_peer_minttl; +extern int inet_peer_maxttl; +extern int inet_peer_gc_mintime; +extern int inet_peer_gc_maxtime; + +/* From ip_output.c */ +extern int sysctl_ip_dynaddr; + +extern void ipfrag_init(void); + #ifdef CONFIG_INET /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */ @@ -348,5 +366,10 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, void **context); +#ifdef CONFIG_PROC_FS +extern int ip_misc_proc_init(void); +#endif + +extern struct ctl_table ipv4_table[]; #endif /* _IP_H */ diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a4208a3..14de4eb 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -295,4 +295,9 @@ static inline void fib_res_put(struct fib_result *res) #endif } +#ifdef CONFIG_PROC_FS +extern int fib_proc_init(void); +extern void fib_proc_exit(void); +#endif + #endif /* _NET_FIB_H */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c5a02dd..3203eaf 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -104,6 +104,7 @@ struct frag_hdr { #ifdef __KERNEL__ +#include #include /* sysctls */ @@ -464,8 +465,38 @@ extern int sysctl_ip6frag_low_thresh; extern int sysctl_ip6frag_time; extern int sysctl_ip6frag_secret_interval; -#endif /* __KERNEL__ */ -#endif /* _NET_IPV6_H */ +extern struct proto_ops inet6_stream_ops; +extern struct proto_ops inet6_dgram_ops; + +extern int ip6_mc_source(int add, int omode, struct sock *sk, + struct group_source_req *pgsr); +extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); +extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, + int __user *optlen); + +#ifdef CONFIG_PROC_FS +extern int ac6_proc_init(void); +extern void ac6_proc_exit(void); +extern int raw6_proc_init(void); +extern void raw6_proc_exit(void); +extern int tcp6_proc_init(void); +extern void tcp6_proc_exit(void); +extern int udp6_proc_init(void); +extern void udp6_proc_exit(void); +extern int ipv6_misc_proc_init(void); +extern void ipv6_misc_proc_exit(void); + +extern struct rt6_statistics rt6_stats; +#endif +#ifdef CONFIG_SYSCTL +extern ctl_table ipv6_route_table[]; +extern ctl_table ipv6_icmp_table[]; +extern void ipv6_sysctl_register(void); +extern void ipv6_sysctl_unregister(void); +#endif +#endif /* __KERNEL__ */ +#endif /* _NET_IPV6_H */ diff --git a/include/net/p8022.h b/include/net/p8022.h index 223f8fa..42e9fac 100644 --- a/include/net/p8022.h +++ b/include/net/p8022.h @@ -8,4 +8,6 @@ extern struct datalink_proto * struct net_device *orig_dev)); extern void unregister_8022_client(struct datalink_proto *proto); +extern struct datalink_proto *make_8023_client(void); +extern void destroy_8023_client(struct datalink_proto *dl); #endif diff --git a/include/net/raw.h b/include/net/raw.h index 1c4bc3e..f479174 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -17,10 +17,10 @@ #ifndef _RAW_H #define _RAW_H +#include extern struct proto raw_prot; - extern void raw_err(struct sock *, struct sk_buff *, u32 info); extern int raw_rcv(struct sock *, struct sk_buff *); @@ -39,4 +39,9 @@ extern struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, extern int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); +#ifdef CONFIG_PROC_FS +extern int raw_proc_init(void); +extern void raw_proc_exit(void); +#endif + #endif /* _RAW_H */ diff --git a/include/net/route.h b/include/net/route.h index 63c9455..dbe79ca 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -195,4 +195,6 @@ static inline struct inet_peer *rt_get_peer(struct rtable *rt) return rt->peer; } +extern ctl_table ipv4_route_table[]; + #endif /* _ROUTE_H */ diff --git a/include/net/sock.h b/include/net/sock.h index d594288..1418388 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1370,4 +1370,16 @@ static inline int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsign } #endif +extern void sk_init(void); + +#ifdef CONFIG_SYSCTL +extern struct ctl_table core_table[]; +extern int sysctl_optmem_max; +#endif + +#ifdef CONFIG_PROC_FS +extern __u32 sysctl_wmem_default; +extern __u32 sysctl_rmem_default; +#endif + #endif /* _SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d958260..d6bcf13 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1183,4 +1183,16 @@ struct tcp_iter_state { extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); +extern struct request_sock_ops tcp_request_sock_ops; + +extern int tcp_v4_destroy_sock(struct sock *sk); + +#ifdef CONFIG_PROC_FS +extern int tcp4_proc_init(void); +extern void tcp4_proc_exit(void); +#endif + +extern void tcp_v4_init(struct net_proto_family *ops); +extern void tcp_init(void); + #endif /* _TCP_H */ diff --git a/include/net/udp.h b/include/net/udp.h index ac229b7..107b9d7 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -94,6 +94,11 @@ struct udp_iter_state { struct seq_operations seq_ops; }; +#ifdef CONFIG_PROC_FS extern int udp_proc_register(struct udp_seq_afinfo *afinfo); extern void udp_proc_unregister(struct udp_seq_afinfo *afinfo); + +extern int udp4_proc_init(void); +extern void udp4_proc_exit(void); +#endif #endif /* _UDP_H */ diff --git a/init/main.c b/init/main.c index c9c311c..ff41006 100644 --- a/init/main.c +++ b/init/main.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -80,7 +81,6 @@ static int init(void *); extern void init_IRQ(void); -extern void sock_init(void); extern void fork_init(unsigned long); extern void mca_init(void); extern void sbus_init(void); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e0bbee..8e56e24 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -136,9 +137,6 @@ static struct ctl_table_header root_table_header = static ctl_table kern_table[]; static ctl_table vm_table[]; -#ifdef CONFIG_NET -extern ctl_table net_table[]; -#endif static ctl_table proc_table[]; static ctl_table fs_table[]; static ctl_table debug_table[]; diff --git a/net/802/p8023.c b/net/802/p8023.c index a0b61b4..6368d3d 100644 --- a/net/802/p8023.c +++ b/net/802/p8023.c @@ -20,6 +20,7 @@ #include #include +#include /* * Place an 802.3 header on a packet. The driver will do the mac diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c index 3607963..7001295 100644 --- a/net/802/sysctl_net_802.c +++ b/net/802/sysctl_net_802.c @@ -10,9 +10,10 @@ * 2 of the License, or (at your option) any later version. */ +#include #include +#include #include -#include #ifdef CONFIG_TR extern int sysctl_tr_rif_timeout; diff --git a/net/core/dev.c b/net/core/dev.c index a3ed53c..c01511e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -267,10 +267,6 @@ void dev_add_pack(struct packet_type *pt) spin_unlock_bh(&ptype_lock); } -extern void linkwatch_run_queue(void); - - - /** * __dev_remove_pack - remove packet handler * @pt: packet type declaration @@ -1133,8 +1129,6 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #define illegal_highdma(dev, skb) (0) #endif -extern void skb_release_data(struct sk_buff *); - /* Keep head the same: replace data */ int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8f817ad..2f278c8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -9,23 +9,18 @@ #include #include #include +#include +#include #ifdef CONFIG_SYSCTL extern int netdev_max_backlog; -extern int netdev_budget; extern int weight_p; -extern int net_msg_cost; -extern int net_msg_burst; extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; -extern __u32 sysctl_wmem_default; -extern __u32 sysctl_rmem_default; extern int sysctl_core_destroy_delay; -extern int sysctl_optmem_max; -extern int sysctl_somaxconn; #ifdef CONFIG_NET_DIVERT extern char sysctl_divert_version[]; diff --git a/net/core/utils.c b/net/core/utils.c index 88eb8b6..7b5970f 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -16,7 +16,9 @@ #include #include #include +#include #include +#include #include #include #include diff --git a/net/core/wireless.c b/net/core/wireless.c index 19fa6a5..5caae23 100644 --- a/net/core/wireless.c +++ b/net/core/wireless.c @@ -571,10 +571,6 @@ static int wireless_seq_show(struct seq_file *seq, void *v) return 0; } -extern void *dev_seq_start(struct seq_file *seq, loff_t *pos); -extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); -extern void dev_seq_stop(struct seq_file *seq, void *v); - static struct seq_operations wireless_seq_ops = { .start = dev_seq_start, .next = dev_seq_next, diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f444a2f..87a052a 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -62,8 +62,6 @@ #include #include -extern int __init netdev_boot_setup(char *str); - __setup("ether=", netdev_boot_setup); /* diff --git a/net/ethernet/sysctl_net_ether.c b/net/ethernet/sysctl_net_ether.c index b81a6d5..66b39fc 100644 --- a/net/ethernet/sysctl_net_ether.c +++ b/net/ethernet/sysctl_net_ether.c @@ -7,6 +7,7 @@ #include #include +#include ctl_table ether_table[] = { {0} diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 20f52b5..5810f9d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -859,10 +859,6 @@ static struct net_proto_family inet_family_ops = { .owner = THIS_MODULE, }; - -extern void tcp_init(void); -extern void tcp_v4_init(struct net_proto_family *); - /* Upon startup we insert all the elements in inetsw_array[] into * the linked list inetsw. */ @@ -1132,7 +1128,6 @@ static int __init init_ipv4_mibs(void) } static int ipv4_proc_init(void); -extern void ipfrag_init(void); /* * IP protocol layer initialiser @@ -1253,19 +1248,10 @@ module_init(inet_init); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS -extern int fib_proc_init(void); -extern void fib_proc_exit(void); #ifdef CONFIG_IP_FIB_TRIE extern int fib_stat_proc_init(void); extern void fib_stat_proc_exit(void); #endif -extern int ip_misc_proc_init(void); -extern int raw_proc_init(void); -extern void raw_proc_exit(void); -extern int tcp4_proc_init(void); -extern void tcp4_proc_exit(void); -extern int udp4_proc_init(void); -extern void udp4_proc_exit(void); static int __init ipv4_proc_init(void) { diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 3fd49f4..c1b42b5 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 3c513ce..4410b9d 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -20,6 +20,7 @@ #include #include #include +#include #include /* diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ddb1aed..aca088b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, } case IP_MSFILTER: { - extern int sysctl_optmem_max; extern int sysctl_igmp_max_msf; struct ip_msfilter *msf; @@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, } case MCAST_MSFILTER: { - extern int sysctl_optmem_max; extern int sysctl_igmp_max_msf; struct sockaddr_in *psin; struct ip_msfilter *msf = NULL; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3eadbb2..f7943ba 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,9 +59,6 @@ static int fold_prot_inuse(struct proto *proto) */ static int sockstat_seq_show(struct seq_file *seq, void *v) { - /* From net/socket.c */ - extern void socket_seq_show(struct seq_file *seq); - socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 8692cb9..a34e60e 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; } -extern struct request_sock_ops tcp_request_sock_ops; - static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ce47a34..6526856 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -19,36 +21,6 @@ /* From af_inet.c */ extern int sysctl_ip_nonlocal_bind; -/* From icmp.c */ -extern int sysctl_icmp_echo_ignore_all; -extern int sysctl_icmp_echo_ignore_broadcasts; -extern int sysctl_icmp_ignore_bogus_error_responses; -extern int sysctl_icmp_errors_use_inbound_ifaddr; - -/* From ip_fragment.c */ -extern int sysctl_ipfrag_low_thresh; -extern int sysctl_ipfrag_high_thresh; -extern int sysctl_ipfrag_time; -extern int sysctl_ipfrag_secret_interval; - -/* From ip_output.c */ -extern int sysctl_ip_dynaddr; - -/* From icmp.c */ -extern int sysctl_icmp_ratelimit; -extern int sysctl_icmp_ratemask; - -/* From igmp.c */ -extern int sysctl_igmp_max_memberships; -extern int sysctl_igmp_max_msf; - -/* From inetpeer.c */ -extern int inet_peer_threshold; -extern int inet_peer_minttl; -extern int inet_peer_maxttl; -extern int inet_peer_gc_mintime; -extern int inet_peer_gc_maxtime; - #ifdef CONFIG_SYSCTL static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; @@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 }; struct ipv4_config ipv4_config; -extern ctl_table ipv4_route_table[]; - #ifdef CONFIG_SYSCTL static @@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * return ret; } -int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - void **context) +static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, + int nlen, void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) { char val[TCP_CA_NAME_MAX]; ctl_table tbl = { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ebb8654..1afb080 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4229,7 +4229,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, 0, 0); + tcp_ack_saw_tstamp(sk, NULL, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 97bbf59..13dfb39 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,6 @@ #include #include -extern int sysctl_ip_dynaddr; int sysctl_tcp_tw_reuse; int sysctl_tcp_low_latency; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 493abf9..937ad32 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1126,7 +1126,7 @@ void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr) __ipv6_dev_mc_dec(idev, &maddr); } -void addrconf_join_anycast(struct inet6_ifaddr *ifp) +static void addrconf_join_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); @@ -1135,7 +1135,7 @@ void addrconf_join_anycast(struct inet6_ifaddr *ifp) ipv6_dev_ac_inc(ifp->idev->dev, &addr); } -void addrconf_leave_anycast(struct inet6_ifaddr *ifp) +static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 7df2ccb..4f8795a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -67,23 +67,6 @@ MODULE_AUTHOR("Cast of dozens"); MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); MODULE_LICENSE("GPL"); -/* IPv6 procfs goodies... */ - -#ifdef CONFIG_PROC_FS -extern int raw6_proc_init(void); -extern void raw6_proc_exit(void); -extern int tcp6_proc_init(void); -extern void tcp6_proc_exit(void); -extern int udp6_proc_init(void); -extern void udp6_proc_exit(void); -extern int ipv6_misc_proc_init(void); -extern void ipv6_misc_proc_exit(void); -extern int ac6_proc_init(void); -extern void ac6_proc_exit(void); -extern int if6_proc_init(void); -extern void if6_proc_exit(void); -#endif - int sysctl_ipv6_bindv6only; /* The inetsw table contains everything that inet_create needs to @@ -505,11 +488,6 @@ static struct net_proto_family inet6_family_ops = { .owner = THIS_MODULE, }; -#ifdef CONFIG_SYSCTL -extern void ipv6_sysctl_register(void); -extern void ipv6_sysctl_unregister(void); -#endif - /* Same as inet6_dgram_ops, sans udp_poll. */ static struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, @@ -676,8 +654,6 @@ static void cleanup_ipv6_mibs(void) snmp6_mib_free((void **)udp_stats_in6); } -extern int ipv6_misc_proc_init(void); - static int __init inet6_init(void) { struct sk_buff *dummy_skb; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 76fe239..7516b88 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -109,13 +109,6 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) return 0; } -extern int ip6_mc_source(int add, int omode, struct sock *sk, - struct group_source_req *pgsr); -extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); -extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen); - - int ipv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { @@ -446,7 +439,6 @@ done: } case MCAST_MSFILTER: { - extern int sysctl_optmem_max; extern int sysctl_mld_max_msf; struct group_filter *gsf; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6ea494a..5d5bbb4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1372,7 +1372,7 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg) * Drop the packet on the floor */ -int ip6_pkt_discard(struct sk_buff *skb) +static int ip6_pkt_discard(struct sk_buff *skb) { IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev); @@ -1380,7 +1380,7 @@ int ip6_pkt_discard(struct sk_buff *skb) return 0; } -int ip6_pkt_discard_out(struct sk_buff *skb) +static int ip6_pkt_discard_out(struct sk_buff *skb) { skb->dev = skb->dst->dev; return ip6_pkt_discard(skb); @@ -1960,8 +1960,6 @@ static int rt6_proc_info(char *buffer, char **start, off_t offset, int length) return arg.len; } -extern struct rt6_statistics rt6_stats; - static int rt6_stats_seq_show(struct seq_file *seq, void *v) { seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e553e5b..c3123c9 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -770,7 +770,7 @@ static int ipip6_tunnel_init(struct net_device *dev) return 0; } -int __init ipip6_fb_tunnel_init(struct net_device *dev) +static int __init ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = dev->priv; struct iphdr *iph = &tunnel->parms.iph; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 3a18e0e..8eff9fa 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -14,9 +14,6 @@ #include #include -extern ctl_table ipv6_route_table[]; -extern ctl_table ipv6_icmp_table[]; - #ifdef CONFIG_SYSCTL static ctl_table ipv6_table[] = { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fb291b8..794734f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1910,8 +1910,6 @@ static int tcp_v6_init_sock(struct sock *sk) static int tcp_v6_destroy_sock(struct sock *sk) { - extern int tcp_v4_destroy_sock(struct sock *sk); - tcp_v4_destroy_sock(sk); return inet6_destroy_sock(sk); } @@ -2123,8 +2121,6 @@ static struct inet6_protocol tcpv6_protocol = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; -extern struct proto_ops inet6_stream_ops; - static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c348307..67d9a04 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1054,8 +1054,6 @@ struct proto udpv6_prot = { .obj_size = sizeof(struct udp6_sock), }; -extern struct proto_ops inet6_dgram_ops; - static struct inet_protosw udpv6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 180e383..34b3bb8 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1940,9 +1940,7 @@ static struct notifier_block ipx_dev_notifier = { }; extern struct datalink_proto *make_EII_client(void); -extern struct datalink_proto *make_8023_client(void); extern void destroy_EII_client(struct datalink_proto *); -extern void destroy_8023_client(struct datalink_proto *); static unsigned char ipx_8022_type = 0xE0; static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; diff --git a/net/socket.c b/net/socket.c index 5f76ab8..ce69b78 100644 --- a/net/socket.c +++ b/net/socket.c @@ -70,6 +70,8 @@ #include #include #include +#include +#include #include #include #include @@ -724,8 +726,8 @@ static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, return __sock_sendmsg(iocb, sock, &x->async_msg, size); } -ssize_t sock_sendpage(struct file *file, struct page *page, - int offset, size_t size, loff_t *ppos, int more) +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) { struct socket *sock; int flags; @@ -948,7 +950,7 @@ static int sock_mmap(struct file * file, struct vm_area_struct * vma) return sock->ops->mmap(file, sock, vma); } -int sock_close(struct inode *inode, struct file *filp) +static int sock_close(struct inode *inode, struct file *filp) { /* * It was possible the inode is NULL we were @@ -2027,9 +2029,6 @@ int sock_unregister(int family) return 0; } - -extern void sk_init(void); - void __init sock_init(void) { /* diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 3f6e310..c5241fc 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -17,17 +17,15 @@ #include #ifdef CONFIG_INET -extern struct ctl_table ipv4_table[]; +#include #endif -extern struct ctl_table core_table[]; - #ifdef CONFIG_NET -extern struct ctl_table ether_table[]; +#include #endif #ifdef CONFIG_TR -extern struct ctl_table tr_table[]; +#include #endif struct ctl_table net_table[] = { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bc4c445..41feca3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2026,14 +2026,6 @@ static struct net_proto_family unix_family_ops = { .owner = THIS_MODULE, }; -#ifdef CONFIG_SYSCTL -extern void unix_sysctl_register(void); -extern void unix_sysctl_unregister(void); -#else -static inline void unix_sysctl_register(void) {} -static inline void unix_sysctl_unregister(void) {} -#endif - static int __init af_unix_init(void) { int rc = -1; diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index c974dac..690ffa5 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -12,7 +12,7 @@ #include #include -extern int sysctl_unix_max_dgram_qlen; +#include static ctl_table unix_table[] = { { -- cgit v0.10.2 From 6ed8a48582c08432e84e5610564c1d25fe00dd7f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 16 Aug 2005 19:02:15 -0300 Subject: [NETLINK]: Fix sparse warnings Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 7d1d968..1675186 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -129,7 +129,7 @@ extern struct sock *netlink_kernel_create(int unit, unsigned int groups, void (* extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock); extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid, - __u32 group, int allocation); + __u32 group, unsigned int __nocast allocation); extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code); extern int netlink_register_notifier(struct notifier_block *nb); extern int netlink_unregister_notifier(struct notifier_block *nb); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e259f46..62435ff 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -861,7 +861,7 @@ out: } int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, - u32 group, int allocation) + u32 group, unsigned int __nocast allocation) { struct netlink_broadcast_data info; struct hlist_node *node; -- cgit v0.10.2 From 4c6ea29d82e0d1b9b37e6b879e0a7fd6c409333d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 16 Aug 2005 19:46:48 -0300 Subject: [IP]: Introduce ip_options_get_from_user This variant is needed to satisfy sparse __user annotations. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/ip.h b/include/net/ip.h index 7623e41..e4563bb 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -335,7 +335,10 @@ extern void ip_options_build(struct sk_buff *skb, struct ip_options *opt, u32 da extern int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb); extern void ip_options_fragment(struct sk_buff *skb); extern int ip_options_compile(struct ip_options *opt, struct sk_buff *skb); -extern int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user); +extern int ip_options_get(struct ip_options **optp, + unsigned char *data, int optlen); +extern int ip_options_get_from_user(struct ip_options **optp, + unsigned char __user *data, int optlen); extern void ip_options_undo(struct ip_options * opt); extern void ip_forward_options(struct sk_buff *skb); extern int ip_options_rcv_srr(struct sk_buff *skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 7e02ba5..bce4e87 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt) } } -int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user) +static struct ip_options *ip_options_get_alloc(const int optlen) { - struct ip_options *opt; + struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3), + GFP_KERNEL); + if (opt) + memset(opt, 0, sizeof(*opt)); + return opt; +} - opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL); - if (!opt) - return -ENOMEM; - memset(opt, 0, sizeof(struct ip_options)); - if (optlen) { - if (user) { - if (copy_from_user(opt->__data, data, optlen)) { - kfree(opt); - return -EFAULT; - } - } else - memcpy(opt->__data, data, optlen); - } +static int ip_options_get_finish(struct ip_options **optp, + struct ip_options *opt, int optlen) +{ while (optlen & 3) opt->__data[optlen++] = IPOPT_END; opt->optlen = optlen; @@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in return 0; } +int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen) +{ + struct ip_options *opt = ip_options_get_alloc(optlen); + + if (!opt) + return -ENOMEM; + if (optlen && copy_from_user(opt->__data, data, optlen)) { + kfree(opt); + return -EFAULT; + } + return ip_options_get_finish(optp, opt, optlen); +} + +int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen) +{ + struct ip_options *opt = ip_options_get_alloc(optlen); + + if (!opt) + return -ENOMEM; + if (optlen) + memcpy(opt->__data, data, optlen); + return ip_options_get_finish(optp, opt, optlen); +} + void ip_forward_options(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aca088b..2f0b47d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); - err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0); + err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); if (err) return err; break; @@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, struct ip_options * opt = NULL; if (optlen > 40 || optlen < 0) goto e_inval; - err = ip_options_get(&opt, optval, optlen, 1); + err = ip_options_get_from_user(&opt, optval, optlen); if (err) break; if (sk->sk_type == SOCK_STREAM) { -- cgit v0.10.2 From bf73d1c5d726ac717755efc7e15d2a86dd383448 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 16 Aug 2005 20:45:45 -0700 Subject: [IRDA]: Possible cleanups. This patch contains the following possible cleanups: - make the following needlessly global function static: - irnet/irnet_ppp.c: irnet_init - remove the following unneeded EXPORT_SYMBOL's: - irlmp.c: sysctl_discovery_timeout - irlmp.c: irlmp_reasons - irlmp.c: irlmp_dup - irqueue.c: hashbin_find_next Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: David S. Miller diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index 7a4a4d7..c19e9ce 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -53,7 +53,6 @@ struct irlmp_cb *irlmp = NULL; /* These can be altered by the sysctl interface */ int sysctl_discovery = 0; int sysctl_discovery_timeout = 3; /* 3 seconds by default */ -EXPORT_SYMBOL(sysctl_discovery_timeout); int sysctl_discovery_slots = 6; /* 6 slots by default */ int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ; char sysctl_devname[65]; @@ -67,7 +66,6 @@ const char *irlmp_reasons[] = { "LM_INIT_DISCONNECT", "ERROR, NOT USED", }; -EXPORT_SYMBOL(irlmp_reasons); /* * Function irlmp_init (void) @@ -675,7 +673,6 @@ struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance) return new; } -EXPORT_SYMBOL(irlmp_dup); /* * Function irlmp_disconnect_request (handle, userdata) diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h index 9004f73..b391cb3 100644 --- a/net/irda/irnet/irnet.h +++ b/net/irda/irnet/irnet.h @@ -517,9 +517,6 @@ extern int irda_irnet_init(void); /* Initialise IrDA part of IrNET */ extern void irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */ -/* ---------------------------- MODULE ---------------------------- */ -extern int - irnet_init(void); /* Initialise IrNET module */ /**************************** VARIABLES ****************************/ diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index f8f984b..e53bf9e 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -1107,7 +1107,7 @@ ppp_irnet_cleanup(void) /* * Module main entry point */ -int __init +static int __init irnet_init(void) { int err; diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c index b0dd3ea..1ba8c71 100644 --- a/net/irda/irqueue.c +++ b/net/irda/irqueue.c @@ -822,7 +822,6 @@ void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name, return entry; } -EXPORT_SYMBOL(hashbin_find_next); /* * Function hashbin_get_first (hashbin) -- cgit v0.10.2 From ba602a816132dcc66e875dddf2c62512a9f6f8cb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 16 Aug 2005 20:50:16 -0700 Subject: [IPVS]: Rename tcp_{init,exit}() --> ip_vs_tcp_{init,exit}() Conflicts with global namespace functions with the same name. Signed-off-by: David S. Miller diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index e65de67..c194089 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) } -static void tcp_init(struct ip_vs_protocol *pp) +static void ip_vs_tcp_init(struct ip_vs_protocol *pp) { IP_VS_INIT_HASH_TABLE(tcp_apps); pp->timeout_table = tcp_timeouts; } -static void tcp_exit(struct ip_vs_protocol *pp) +static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) { } @@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .protocol = IPPROTO_TCP, .dont_defrag = 0, .appcnt = ATOMIC_INIT(0), - .init = tcp_init, - .exit = tcp_exit, + .init = ip_vs_tcp_init, + .exit = ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, -- cgit v0.10.2 From e92ae93a8aa66aea12935420cb22d4df1c18d023 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 17 Aug 2005 03:10:59 -0300 Subject: [DCCP]: Send SYNCACK packets in response to SYNC packets Also fix step 6 when receiving SYNC or SYNCACK packets, i.e. we were not using the updated swl. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index fff794c..4efdce4 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -122,7 +122,8 @@ extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb); extern int dccp_send_response(struct sock *sk); extern void dccp_send_ack(struct sock *sk); extern void dccp_send_delayed_ack(struct sock *sk); -extern void dccp_send_sync(struct sock *sk, u64 seq); +extern void dccp_send_sync(struct sock *sk, const u64 seq, + const enum dccp_pkt_type pkt_type); extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, const int len); diff --git a/net/dccp/input.c b/net/dccp/input.c index 9dadfc3..68b6e72 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -50,7 +50,7 @@ static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) * Drop packet and return */ if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); return; } @@ -76,8 +76,7 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); struct dccp_sock *dp = dccp_sk(sk); - u64 lswl = dp->dccps_swl; - u64 lawl = dp->dccps_awl; + u64 lswl, lawl; /* * Step 5: Prepare sequence numbers for Sync @@ -99,6 +98,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); else return -1; + } + /* * Step 6: Check sequence numbers * Let LSWL = S.SWL and LAWL = S.AWL @@ -113,7 +114,10 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) * Send Sync packet acknowledging P.seqno * Drop packet and return */ - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ || + lswl = dp->dccps_swl; + lawl = dp->dccps_awl; + + if (dh->dccph_type == DCCP_PKT_CLOSEREQ || dh->dccph_type == DCCP_PKT_CLOSE || dh->dccph_type == DCCP_PKT_RESET) { lswl = dp->dccps_gsr; @@ -132,8 +136,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) DCCP_PKT_WITHOUT_ACK_SEQ)) dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq; } else { - dccp_pr_debug("Step 6 failed, sending SYNC...\n"); - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + LIMIT_NETDEBUG("Step 6 failed, sending SYNC...\n"); + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); return -1; } @@ -242,9 +246,21 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, check_seq: if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) { send_sync: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_PKT_SYNC); } break; + case DCCP_PKT_SYNC: + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_PKT_SYNCACK); + /* + * From the draft: + * + * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets + * MAY have non-zero-length application data areas, whose + * contents * receivers MUST ignore. + */ + goto discard; } DCCP_INC_STATS_BH(DCCP_MIB_INERRS); @@ -517,7 +533,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, dh->dccph_type == DCCP_PKT_REQUEST) || (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq); + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_PKT_SYNC); goto discard; } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index bc3cfc0..335e00e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -376,7 +376,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, * probing, since DCCP-Sync probes do not risk application * data loss. */ - dccp_send_sync(sk, dp->dccps_gsr); + dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); } /* else let the usual retransmit timer handle it */ } @@ -1008,7 +1008,7 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) return 1; if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { - dccp_pr_debug("pskb_may_pull failed\n"); + printk(KERN_WARNING "DCCP: pskb_may_pull failed\n"); return 1; } @@ -1016,7 +1016,7 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) /* If the packet type is not understood, drop packet and return */ if (dh->dccph_type >= DCCP_PKT_INVALID) { - dccp_pr_debug("invalid packet type\n"); + printk(KERN_WARNING "DCCP: invalid packet type\n"); return 1; } @@ -1025,12 +1025,13 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) * packet, drop packet and return */ if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { - dccp_pr_debug("Offset(%u) too small 1\n", dh->dccph_doff); + printk(KERN_WARNING "DCCP: Offset(%u) too small 1\n", + dh->dccph_doff); return 1; } if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) { - dccp_pr_debug("P.Data Offset(%u) too small 2\n", + printk(KERN_WARNING "DCCP: P.Data Offset(%u) too small 2\n", dh->dccph_doff); return 1; } @@ -1045,15 +1046,16 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) dh->dccph_type != DCCP_PKT_DATA && dh->dccph_type != DCCP_PKT_ACK && dh->dccph_type != DCCP_PKT_DATAACK) { - dccp_pr_debug("P.type (%s) not Data, Ack nor DataAck and " - "P.X == 0\n", dccp_packet_name(dh->dccph_type)); + printk(KERN_WARNING "DCCP: P.type (%s) not Data, Ack nor " + "DataAck and P.X == 0\n", + dccp_packet_name(dh->dccph_type)); return 1; } /* If the header checksum is incorrect, drop packet and return */ if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, skb->nh.iph->daddr) < 0) { - dccp_pr_debug("header checksum is incorrect\n"); + printk(KERN_WARNING "DCCP: header checksum is incorrect\n"); return 1; } diff --git a/net/dccp/output.c b/net/dccp/output.c index dcc061b..384fd09 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -404,7 +404,8 @@ void dccp_send_delayed_ack(struct sock *sk) sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } -void dccp_send_sync(struct sock *sk, u64 seq) +void dccp_send_sync(struct sock *sk, const u64 seq, + const enum dccp_pkt_type pkt_type) { /* * We are not putting this on the write queue, so @@ -420,7 +421,7 @@ void dccp_send_sync(struct sock *sk, u64 seq) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_DCCP_HEADER); skb->csum = 0; - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_SYNC; + DCCP_SKB_CB(skb)->dccpd_type = pkt_type; DCCP_SKB_CB(skb)->dccpd_seq = seq; skb_set_owner_w(skb, sk); -- cgit v0.10.2 From d179cd12928443f3ec29cfbc3567439644bd0afc Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 17 Aug 2005 14:57:30 -0700 Subject: [NET]: Implement SKB fast cloning. Protocols that make extensive use of SKB cloning, for example TCP, eat at least 2 allocations per packet sent as a result. To cut the kmalloc() count in half, we implement a pre-allocation scheme wherein we allocate 2 sk_buff objects in advance, then use a simple reference count to free up the memory at the correct time. Based upon an initial patch by Thomas Graf and suggestions from Herbert Xu. Signed-off-by: David S. Miller diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index db10335..42edce6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -162,6 +162,13 @@ struct skb_timeval { u32 off_usec; }; + +enum { + SKB_FCLONE_UNAVAILABLE, + SKB_FCLONE_ORIG, + SKB_FCLONE_CLONE, +}; + /** * struct sk_buff - socket buffer * @next: Next buffer in list @@ -255,7 +262,8 @@ struct sk_buff { ip_summed:2, nohdr:1, nfctinfo:3; - __u8 pkt_type; + __u8 pkt_type:3, + fclone:2; __be16 protocol; void (*destructor)(struct sk_buff *skb); @@ -295,8 +303,20 @@ struct sk_buff { #include extern void __kfree_skb(struct sk_buff *skb); -extern struct sk_buff *alloc_skb(unsigned int size, - unsigned int __nocast priority); +extern struct sk_buff *__alloc_skb(unsigned int size, + unsigned int __nocast priority, int fclone); +static inline struct sk_buff *alloc_skb(unsigned int size, + unsigned int __nocast priority) +{ + return __alloc_skb(size, priority, 0); +} + +static inline struct sk_buff *alloc_skb_fclone(unsigned int size, + unsigned int __nocast priority) +{ + return __alloc_skb(size, priority, 1); +} + extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, unsigned int size, unsigned int __nocast priority); diff --git a/include/net/sock.h b/include/net/sock.h index 1418388..d57aece 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1200,7 +1200,7 @@ static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, int hdr_len; hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header); - skb = alloc_skb(size + hdr_len, gfp); + skb = alloc_skb_fclone(size + hdr_len, gfp); if (skb) { skb->truesize += mem; if (sk->sk_forward_alloc >= (int)skb->truesize || diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 39a161d..b853a9b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -69,6 +69,7 @@ #include static kmem_cache_t *skbuff_head_cache; +static kmem_cache_t *skbuff_fclone_cache; struct timeval __read_mostly skb_tv_base; @@ -120,7 +121,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) */ /** - * alloc_skb - allocate a network buffer + * __alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask * @@ -131,14 +132,20 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. */ -struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask) +struct sk_buff *__alloc_skb(unsigned int size, unsigned int __nocast gfp_mask, + int fclone) { struct sk_buff *skb; u8 *data; /* Get the HEAD */ - skb = kmem_cache_alloc(skbuff_head_cache, - gfp_mask & ~__GFP_DMA); + if (fclone) + skb = kmem_cache_alloc(skbuff_fclone_cache, + gfp_mask & ~__GFP_DMA); + else + skb = kmem_cache_alloc(skbuff_head_cache, + gfp_mask & ~__GFP_DMA); + if (!skb) goto out; @@ -155,7 +162,15 @@ struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask) skb->data = data; skb->tail = data; skb->end = data + size; + if (fclone) { + struct sk_buff *child = skb + 1; + atomic_t *fclone_ref = (atomic_t *) (child + 1); + skb->fclone = SKB_FCLONE_ORIG; + atomic_set(fclone_ref, 1); + + child->fclone = SKB_FCLONE_UNAVAILABLE; + } atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; skb_shinfo(skb)->tso_size = 0; @@ -268,8 +283,34 @@ void skb_release_data(struct sk_buff *skb) */ void kfree_skbmem(struct sk_buff *skb) { + struct sk_buff *other; + atomic_t *fclone_ref; + skb_release_data(skb); - kmem_cache_free(skbuff_head_cache, skb); + switch (skb->fclone) { + case SKB_FCLONE_UNAVAILABLE: + kmem_cache_free(skbuff_head_cache, skb); + break; + + case SKB_FCLONE_ORIG: + fclone_ref = (atomic_t *) (skb + 2); + if (atomic_dec_and_test(fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, skb); + break; + + case SKB_FCLONE_CLONE: + fclone_ref = (atomic_t *) (skb + 1); + other = skb - 1; + + /* The clone portion is available for + * fast-cloning again. + */ + skb->fclone = SKB_FCLONE_UNAVAILABLE; + + if (atomic_dec_and_test(fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, other); + break; + }; } /** @@ -324,10 +365,20 @@ void __kfree_skb(struct sk_buff *skb) struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) { - struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); - - if (!n) - return NULL; + struct sk_buff *n; + + n = skb + 1; + if (skb->fclone == SKB_FCLONE_ORIG && + n->fclone == SKB_FCLONE_UNAVAILABLE) { + atomic_t *fclone_ref = (atomic_t *) (n + 1); + n->fclone = SKB_FCLONE_CLONE; + atomic_inc(fclone_ref); + } else { + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (!n) + return NULL; + n->fclone = SKB_FCLONE_UNAVAILABLE; + } #define C(x) n->x = skb->x @@ -409,6 +460,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->mac.raw = old->mac.raw + offset; memcpy(new->cb, old->cb, sizeof(old->cb)); new->local_df = old->local_df; + new->fclone = SKB_FCLONE_UNAVAILABLE; new->pkt_type = old->pkt_type; new->tstamp = old->tstamp; new->destructor = NULL; @@ -1647,13 +1699,23 @@ void __init skb_init(void) NULL, NULL); if (!skbuff_head_cache) panic("cannot create skbuff cache"); + + skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", + (2*sizeof(struct sk_buff)) + + sizeof(atomic_t), + 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!skbuff_fclone_cache) + panic("cannot create skbuff cache"); + do_gettimeofday(&skb_tv_base); } EXPORT_SYMBOL(___pskb_trim); EXPORT_SYMBOL(__kfree_skb); EXPORT_SYMBOL(__pskb_pull_tail); -EXPORT_SYMBOL(alloc_skb); +EXPORT_SYMBOL(__alloc_skb); EXPORT_SYMBOL(pskb_copy); EXPORT_SYMBOL(pskb_expand_head); EXPORT_SYMBOL(skb_checksum); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8d92ab5..75b6811 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1582,7 +1582,7 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); + skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL); if (skb) break; yield(); @@ -1804,7 +1804,7 @@ int tcp_connect(struct sock *sk) tcp_connect_init(sk); - buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation); + buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; -- cgit v0.10.2 From b1c9fe7b818acbd36dc908c5c1ad4cab34c67b39 Mon Sep 17 00:00:00 2001 From: Ian McDonald Date: Thu, 18 Aug 2005 20:45:29 -0300 Subject: [DCCP]: Fix elapsed time option as per section 13.2 of spec v11 The elapsed time can be two bytes or four bytes only. Signed-off-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/options.c b/net/dccp/options.c index d87d6be..85a86bd 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -293,10 +293,7 @@ static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) static inline int dccp_elapsed_time_len(const u32 elapsed_time) { - return elapsed_time == 0 ? 0 : - elapsed_time <= 0xFF ? 1 : - elapsed_time <= 0xFFFF ? 2 : - elapsed_time <= 0xFFFFFF ? 3 : 4; + return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4; } void dccp_insert_option_elapsed_time(struct sock *sk, -- cgit v0.10.2 From 5480855bfbc125f34d9b752689bb9a64da7e1fc6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 18 Aug 2005 20:47:02 -0300 Subject: [DCCP]: Set dccp_ctl_socket to NULL in dccp_ctl_sock_exit Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0b715ce..8b613c3 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -506,8 +506,10 @@ static int __init dccp_ctl_sock_init(void) #ifdef CONFIG_IP_DCCP_UNLOAD_HACK void dccp_ctl_sock_exit(void) { - if (dccp_ctl_socket != NULL) + if (dccp_ctl_socket != NULL) { sock_release(dccp_ctl_socket); + dccp_ctl_socket = NULL; + } } EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit); -- cgit v0.10.2 From c59eab4637dbc3f832503be4ccb9213b0f323d92 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 18 Aug 2005 21:12:02 -0300 Subject: [DCCP]: Use LIMIT_NETDEBUG in some debugging printks Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/input.c b/net/dccp/input.c index 68b6e72..3c4cbff 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -118,8 +118,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) lawl = dp->dccps_awl; if (dh->dccph_type == DCCP_PKT_CLOSEREQ || - dh->dccph_type == DCCP_PKT_CLOSE || - dh->dccph_type == DCCP_PKT_RESET) { + dh->dccph_type == DCCP_PKT_CLOSE || + dh->dccph_type == DCCP_PKT_RESET) { lswl = dp->dccps_gsr; dccp_inc_seqno(&lswl); lawl = dp->dccps_gar; @@ -136,7 +136,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) DCCP_PKT_WITHOUT_ACK_SEQ)) dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq; } else { - LIMIT_NETDEBUG("Step 6 failed, sending SYNC...\n"); + LIMIT_NETDEBUG(KERN_WARNING "DCCP: Step 6 failed, " + "sending SYNC...\n"); dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); return -1; } @@ -168,8 +169,8 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKPKTS_STATE_RECEIVED)) { - LIMIT_NETDEBUG(KERN_INFO "DCCP: acknowledgeable " - "packets buffer full!\n"); + LIMIT_NETDEBUG(KERN_WARNING "DCCP: acknowledgeable " + "packets buffer full!\n"); ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; inet_csk_schedule_ack(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 335e00e..cc5d60d 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1008,7 +1008,7 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) return 1; if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { - printk(KERN_WARNING "DCCP: pskb_may_pull failed\n"); + LIMIT_NETDEBUG(KERN_WARNING "DCCP: pskb_may_pull failed\n"); return 1; } @@ -1016,7 +1016,7 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) /* If the packet type is not understood, drop packet and return */ if (dh->dccph_type >= DCCP_PKT_INVALID) { - printk(KERN_WARNING "DCCP: invalid packet type\n"); + LIMIT_NETDEBUG(KERN_WARNING "DCCP: invalid packet type\n"); return 1; } @@ -1025,14 +1025,16 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) * packet, drop packet and return */ if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { - printk(KERN_WARNING "DCCP: Offset(%u) too small 1\n", - dh->dccph_doff); + LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) " + "too small 1\n", + dh->dccph_doff); return 1; } if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) { - printk(KERN_WARNING "DCCP: P.Data Offset(%u) too small 2\n", - dh->dccph_doff); + LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) " + "too small 2\n", + dh->dccph_doff); return 1; } @@ -1046,16 +1048,17 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) dh->dccph_type != DCCP_PKT_DATA && dh->dccph_type != DCCP_PKT_ACK && dh->dccph_type != DCCP_PKT_DATAACK) { - printk(KERN_WARNING "DCCP: P.type (%s) not Data, Ack nor " - "DataAck and P.X == 0\n", - dccp_packet_name(dh->dccph_type)); + LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.type (%s) not Data, Ack " + "nor DataAck and P.X == 0\n", + dccp_packet_name(dh->dccph_type)); return 1; } /* If the header checksum is incorrect, drop packet and return */ if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, skb->nh.iph->daddr) < 0) { - printk(KERN_WARNING "DCCP: header checksum is incorrect\n"); + LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is " + "incorrect\n"); return 1; } -- cgit v0.10.2 From bf0ff9e578ba7dd8331005f00ad7310122011f43 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 19 Aug 2005 16:37:30 -0700 Subject: [IPVS]: ipv4_table --> ipvs_ipv4_table Fix conflict with symbol of same name in global namespace. Signed-off-by: David S. Miller diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7d99ede..2d66848 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1598,7 +1598,7 @@ static ctl_table vs_table[] = { { .ctl_name = 0 } }; -static ctl_table ipv4_table[] = { +static ctl_table ipvs_ipv4_table[] = { { .ctl_name = NET_IPV4, .procname = "ipv4", @@ -1613,7 +1613,7 @@ static ctl_table vs_root_table[] = { .ctl_name = CTL_NET, .procname = "net", .mode = 0555, - .child = ipv4_table, + .child = ipvs_ipv4_table, }, { .ctl_name = 0 } }; diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index c035838..561cda3 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -131,7 +131,7 @@ static ctl_table vs_table[] = { { .ctl_name = 0 } }; -static ctl_table ipv4_table[] = { +static ctl_table ipvs_ipv4_table[] = { { .ctl_name = NET_IPV4, .procname = "ipv4", @@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = { .ctl_name = CTL_NET, .procname = "net", .mode = 0555, - .child = ipv4_table + .child = ipvs_ipv4_table }, { .ctl_name = 0 } }; diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 22b5dd5..ce456db 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -320,7 +320,7 @@ static ctl_table vs_table[] = { { .ctl_name = 0 } }; -static ctl_table ipv4_table[] = { +static ctl_table ipvs_ipv4_table[] = { { .ctl_name = NET_IPV4, .procname = "ipv4", @@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = { .ctl_name = CTL_NET, .procname = "net", .mode = 0555, - .child = ipv4_table + .child = ipvs_ipv4_table }, { .ctl_name = 0 } }; -- cgit v0.10.2 From 1bc0986957b63a2fbbc46ab95d3d1d72830bda83 Mon Sep 17 00:00:00 2001 From: Ian McDonald Date: Sat, 20 Aug 2005 00:23:43 -0300 Subject: [DCCP]: Fix the timestamp options This changes timestamp, timestamp echo, and elapsed time to use units of 10 usecs as per DCCP spec. This has been tested to verify that times are correct. Also fixed up length and used hton/ntoh more. Still to add in later patches: - actually use elapsed time to adjust RTT (commented out as was prior to this patch) - send options at times more closely following the spec (content is now correct) Signed-off-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 3dccdd5..9e3a137 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -415,7 +415,7 @@ struct dccp_sock { __u64 dccps_gsr; __u64 dccps_gar; unsigned long dccps_service; - unsigned long dccps_timestamp_time; + struct timeval dccps_timestamp_time; __u32 dccps_timestamp_echo; __u32 dccps_avg_packet_size; unsigned long dccps_ndp_count; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 2dd3e94..6941490 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -2,12 +2,12 @@ * net/dccp/ccids/ccid3.c * * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald * * An implementation of the DCCP protocol * * This code has been developed by the University of Waikato WAND * research group. For further information please see http://www.wand.net.nz/ - * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz * * This code also uses code from Lulea University, rereleased as GPL by its * authors: @@ -174,14 +174,6 @@ static inline void timeval_fix(struct timeval *tv) } } -/* returns the difference in usecs between timeval passed in and current time */ -static inline u32 now_delta(struct timeval tv) { - struct timeval now; - - do_gettimeofday(&now); - return ((now.tv_sec-tv.tv_sec)*1000000+now.tv_usec-tv.tv_usec); -} - #define CALCX_ARRSIZE 500 #define CALCX_SPLIT 50000 @@ -1110,7 +1102,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) struct ccid3_options_received *opt_recv; struct dccp_tx_hist_entry *packet; unsigned long next_tmout; - u16 t_elapsed; + u32 t_elapsed; u32 pinv; u32 x_recv; u32 r_sample; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 4efdce4..aab72b8 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -4,7 +4,8 @@ * net/dccp/dccp.h * * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * Copyright (c) 2005 Ian McDonald * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as @@ -404,6 +405,7 @@ extern struct socket *dccp_ctl_socket; * @dccpap_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. * * @dccpap_buf_len - circular buffer length + * @dccpap_time - the time in usecs * @dccpap_buf - circular buffer of acknowledgeable packets */ struct dccp_ackpkts { @@ -416,7 +418,7 @@ struct dccp_ackpkts { unsigned int dccpap_buf_vector_len; unsigned int dccpap_ack_vector_len; unsigned int dccpap_buf_len; - unsigned long dccpap_time; + struct timeval dccpap_time; u8 dccpap_buf_nonce; u8 dccpap_ack_nonce; u8 dccpap_buf[0]; @@ -430,6 +432,19 @@ extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state); extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk, u64 ackno); +/* + * Returns the difference in usecs between timeval + * passed in and current time + */ +static inline u32 now_delta(struct timeval tv) +{ + struct timeval now; + + do_gettimeofday(&now); + return (now.tv_sec - tv.tv_sec) * USEC_PER_SEC + + (now.tv_usec - tv.tv_usec); +} + #ifdef CONFIG_IP_DCCP_DEBUG extern void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len); diff --git a/net/dccp/options.c b/net/dccp/options.c index 85a86bd..7ecffdf 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -2,8 +2,9 @@ * net/dccp/options.c * * An implementation of the DCCP protocol - * Aristeu Sergio Rozanski Filho - * Arnaldo Carvalho de Melo + * Copyright (c) 2005 Aristeu Sergio Rozanski Filho + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * Copyright (c) 2005 Ian McDonald * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -138,7 +139,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) opt_recv->dccpor_timestamp = ntohl(*(u32 *)value); dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp; - dp->dccps_timestamp_time = jiffies; + do_gettimeofday(&dp->dccps_timestamp_time); dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n", debug_prefix, opt_recv->dccpor_timestamp, @@ -146,36 +147,45 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_ack_seq); break; case DCCPO_TIMESTAMP_ECHO: - if (len < 4 || len > 8) + if (len != 4 && len != 6 && len != 8) goto out_invalid_option; opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value); - dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, " - "diff=%u\n", + dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, ", debug_prefix, opt_recv->dccpor_timestamp_echo, len + 2, (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq, - (tcp_time_stamp - - opt_recv->dccpor_timestamp_echo)); - - opt_recv->dccpor_elapsed_time = - dccp_decode_value_var(value + 4, - len - 4); - dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n", + DCCP_SKB_CB(skb)->dccpd_ack_seq); + + if (len > 4) { + if (len == 6) + opt_recv->dccpor_elapsed_time = + ntohs(*(u16 *)(value + 4)); + else + opt_recv->dccpor_elapsed_time = + ntohl(*(u32 *)(value + 4)); + + dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n", debug_prefix, opt_recv->dccpor_elapsed_time); + } break; case DCCPO_ELAPSED_TIME: - if (len > 4) + if (len != 2 && len != 4) goto out_invalid_option; if (pkt_type == DCCP_PKT_DATA) continue; - opt_recv->dccpor_elapsed_time = - dccp_decode_value_var(value, len); + + if (len == 2) + opt_recv->dccpor_elapsed_time = + ntohs(*(u16 *)value); + else + opt_recv->dccpor_elapsed_time = + ntohl(*(u32 *)value); + dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix, opt_recv->dccpor_elapsed_time); break; @@ -309,8 +319,7 @@ void dccp_insert_option_elapsed_time(struct sock *sk, const int len = 2 + elapsed_time_len; unsigned char *to; - /* If elapsed_time == 0... */ - if (elapsed_time_len == 2) + if (elapsed_time_len == 0) return; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { @@ -325,7 +334,13 @@ void dccp_insert_option_elapsed_time(struct sock *sk, *to++ = DCCPO_ELAPSED_TIME; *to++ = len; - dccp_encode_value_var(elapsed_time, to, elapsed_time_len); + if (elapsed_time_len == 2) { + const u16 var16 = htons((u16)elapsed_time); + memcpy(to, &var16, 2); + } else { + const u32 var32 = htonl(elapsed_time); + memcpy(to, &var32, 4); + } dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n", debug_prefix, elapsed_time, @@ -344,7 +359,7 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) #endif struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; int len = ap->dccpap_buf_vector_len + 2; - const u32 elapsed_time = jiffies_to_usecs(jiffies - ap->dccpap_time) / 10; + const u32 elapsed_time = now_delta(ap->dccpap_time) / 10; unsigned char *to, *from; if (elapsed_time != 0) @@ -414,7 +429,15 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) static inline void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb) { - const u32 now = htonl(tcp_time_stamp); + struct timeval tv; + u32 now; + + do_gettimeofday(&tv); + now = (tv.tv_sec * USEC_PER_SEC + tv.tv_usec) / 10; + /* yes this will overflow but that is the point as we want a + * 10 usec 32 bit timer which mean it wraps every 11.9 hours */ + + now = htonl(now); dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now)); } @@ -427,8 +450,7 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk, "CLIENT TX opt: " : "server TX opt: "; #endif u32 tstamp_echo; - const u32 elapsed_time = jiffies_to_usecs(jiffies - - dp->dccps_timestamp_time) / 10; + const u32 elapsed_time = now_delta(dp->dccps_timestamp_time) / 10; const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); const int len = 6 + elapsed_time_len; unsigned char *to; @@ -448,7 +470,14 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk, tstamp_echo = htonl(dp->dccps_timestamp_echo); memcpy(to, &tstamp_echo, 4); to += 4; - dccp_encode_value_var(elapsed_time, to, elapsed_time_len); + + if (elapsed_time_len == 2) { + const u16 var16 = htons((u16)elapsed_time); + memcpy(to, &var16, 2); + } else if (elapsed_time_len == 4) { + const u32 var32 = htonl(elapsed_time); + memcpy(to, &var32, 4); + } dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n", debug_prefix, dp->dccps_timestamp_echo, @@ -456,7 +485,8 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk, (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); dp->dccps_timestamp_echo = 0; - dp->dccps_timestamp_time = 0; + dp->dccps_timestamp_time.tv_sec = 0; + dp->dccps_timestamp_time.tv_usec = 0; } void dccp_insert_options(struct sock *sk, struct sk_buff *skb) @@ -514,7 +544,8 @@ struct dccp_ackpkts *dccp_ackpkts_alloc(const unsigned int len, ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; ap->dccpap_buf_nonce = ap->dccpap_buf_nonce = 0; ap->dccpap_ack_ptr = 0; - ap->dccpap_time = 0; + ap->dccpap_time.tv_sec = 0; + ap->dccpap_time.tv_usec = 0; ap->dccpap_buf_vector_len = ap->dccpap_ack_vector_len = 0; } @@ -665,7 +696,7 @@ int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state) } ap->dccpap_buf_ackno = ackno; - ap->dccpap_time = jiffies; + do_gettimeofday(&ap->dccpap_time); out: dccp_pr_debug(""); dccp_ackpkts_print(ap); -- cgit v0.10.2 From 8cd25c1fcfbf6460983e99091d278187421c1a1d Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 20 Aug 2005 17:14:11 -0700 Subject: [NET]: fix PROC_FS=n compile Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller diff --git a/include/net/sock.h b/include/net/sock.h index d57aece..312cb25 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1377,9 +1377,7 @@ extern struct ctl_table core_table[]; extern int sysctl_optmem_max; #endif -#ifdef CONFIG_PROC_FS extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; -#endif #endif /* _SOCK_H */ -- cgit v0.10.2 From a6f9a70578b981321b63786ac8015f17cca4fcbd Mon Sep 17 00:00:00 2001 From: Jon Wetzel Date: Sat, 20 Aug 2005 17:15:54 -0700 Subject: [NET]: Add support for getting the permanent hardware address. This patch adds a new field to net device to hold the permanent hardware address, and adds a new generic ethtool_op function to get that address. Signed-off-by: Jon Wetzel Signed-off-by: John W. Linville Signed-off-by: David S. Miller diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index d7021c3..ed1440e 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -250,6 +250,12 @@ struct ethtool_stats { u64 data[0]; }; +struct ethtool_perm_addr { + u32 cmd; /* ETHTOOL_GPERMADDR */ + u32 size; + u8 data[0]; +}; + struct net_device; /* Some generic methods drivers may use in their ethtool_ops */ @@ -261,6 +267,8 @@ u32 ethtool_op_get_sg(struct net_device *dev); int ethtool_op_set_sg(struct net_device *dev, u32 data); u32 ethtool_op_get_tso(struct net_device *dev); int ethtool_op_set_tso(struct net_device *dev, u32 data); +int ethtool_op_get_perm_addr(struct net_device *dev, + struct ethtool_perm_addr *addr, u8 *data); /** * ðtool_ops - Alter and report network device settings @@ -294,7 +302,8 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data); * get_strings: Return a set of strings that describe the requested objects * phys_id: Identify the device * get_stats: Return statistics about the device - * + * get_perm_addr: Gets the permanent hardware address + * * Description: * * get_settings: @@ -352,6 +361,7 @@ struct ethtool_ops { int (*phys_id)(struct net_device *, u32); int (*get_stats_count)(struct net_device *); void (*get_ethtool_stats)(struct net_device *, struct ethtool_stats *, u64 *); + int (*get_perm_addr)(struct net_device *, struct ethtool_perm_addr *, u8 *); int (*begin)(struct net_device *); void (*complete)(struct net_device *); }; @@ -389,6 +399,7 @@ struct ethtool_ops { #define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ #define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ #define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ +#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1fcaa88..7c71790 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -337,6 +337,7 @@ struct net_device /* Interface address info. */ unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address */ + unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */ unsigned char addr_len; /* hardware address length */ unsigned short dev_id; /* for shared network cards */ diff --git a/net/core/ethtool.c b/net/core/ethtool.c index a3eeb88..289c1b5 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -81,6 +81,18 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data) return 0; } +int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *addr, u8 *data) +{ + unsigned char len = dev->addr_len; + if ( addr->size < len ) + return -ETOOSMALL; + + addr->size = len; + memcpy(data, dev->perm_addr, len); + return 0; +} + + /* Handlers for each ethtool command */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) @@ -683,6 +695,39 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) return ret; } +static int ethtool_get_perm_addr(struct net_device *dev, void *useraddr) +{ + struct ethtool_perm_addr epaddr; + u8 *data; + int ret; + + if (!dev->ethtool_ops->get_perm_addr) + return -EOPNOTSUPP; + + if (copy_from_user(&epaddr,useraddr,sizeof(epaddr))) + return -EFAULT; + + data = kmalloc(epaddr.size, GFP_USER); + if (!data) + return -ENOMEM; + + ret = dev->ethtool_ops->get_perm_addr(dev,&epaddr,data); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) + goto out; + useraddr += sizeof(epaddr); + if (copy_to_user(useraddr, data, epaddr.size)) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev.c */ int dev_ethtool(struct ifreq *ifr) @@ -806,6 +851,9 @@ int dev_ethtool(struct ifreq *ifr) case ETHTOOL_GSTATS: rc = ethtool_get_stats(dev, useraddr); break; + case ETHTOOL_GPERMADDR: + rc = ethtool_get_perm_addr(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } @@ -826,6 +874,7 @@ int dev_ethtool(struct ifreq *ifr) EXPORT_SYMBOL(dev_ethtool); EXPORT_SYMBOL(ethtool_op_get_link); +EXPORT_SYMBOL_GPL(ethtool_op_get_perm_addr); EXPORT_SYMBOL(ethtool_op_get_sg); EXPORT_SYMBOL(ethtool_op_get_tso); EXPORT_SYMBOL(ethtool_op_get_tx_csum); -- cgit v0.10.2 From 2c656491e9ce77e12337073973794c4be467a489 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:24:25 -0700 Subject: [NET]: Fix ipl=>ihl typo in ip_fast_csum Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/include/asm-i386/checksum.h b/include/asm-i386/checksum.h index f949e44..67d3630 100644 --- a/include/asm-i386/checksum.h +++ b/include/asm-i386/checksum.h @@ -83,7 +83,7 @@ static inline unsigned short ip_fast_csum(unsigned char * iph, "adcl $0, %0 ;\n" "notl %0 ;\n" "2: ;\n" - /* Since the input registers which are loaded with iph and ipl + /* Since the input registers which are loaded with iph and ihl are modified, we must also specify them as outputs, or gcc will assume they contain their original values. */ : "=r" (sum), "=r" (iph), "=r" (ihl) diff --git a/include/asm-m32r/checksum.h b/include/asm-m32r/checksum.h index 99f37db..877ebf4 100644 --- a/include/asm-m32r/checksum.h +++ b/include/asm-m32r/checksum.h @@ -105,7 +105,7 @@ static inline unsigned short ip_fast_csum(unsigned char * iph, " addx %0, %3 \n" " .fillinsn\n" "2: \n" - /* Since the input registers which are loaded with iph and ipl + /* Since the input registers which are loaded with iph and ihl are modified, we must also specify them as outputs, or gcc will assume they contain their original values. */ : "=&r" (sum), "=r" (iph), "=r" (ihl), "=&r" (tmpreg0), "=&r" (tmpreg1) diff --git a/include/asm-x86_64/checksum.h b/include/asm-x86_64/checksum.h index d01356f..989469e 100644 --- a/include/asm-x86_64/checksum.h +++ b/include/asm-x86_64/checksum.h @@ -64,7 +64,7 @@ static inline unsigned short ip_fast_csum(unsigned char *iph, unsigned int ihl) " adcl $0, %0\n" " notl %0\n" "2:" - /* Since the input registers which are loaded with iph and ipl + /* Since the input registers which are loaded with iph and ihl are modified, we must also specify them as outputs, or gcc will assume they contain their original values. */ : "=r" (sum), "=r" (iph), "=r" (ihl) -- cgit v0.10.2 From 58615242417638794a5ba299c49e3fbd6f47c2a3 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:25:29 -0700 Subject: [IPV4]: Consistency and whitespace cleanup of ip_rcv() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 81e1802..322b082 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -361,6 +361,7 @@ drop: int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct iphdr *iph; + u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. @@ -392,7 +393,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, */ if (iph->ihl < 5 || iph->version != 4) - goto inhdr_error; + goto inhdr_error; if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; @@ -400,21 +401,19 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, iph = skb->nh.iph; if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) - goto inhdr_error; + goto inhdr_error; - { - __u32 len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl<<2)) - goto inhdr_error; + len = ntohs(iph->tot_len); + if (skb->len < len || len < (iph->ihl*4)) + goto inhdr_error; - /* Our transport medium may have padded the buffer out. Now we know it - * is IP we can trim to the true length of the frame. - * Note this now means skb->len holds ntohs(iph->tot_len). - */ - if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); - goto drop; - } + /* Our transport medium may have padded the buffer out. Now we know it + * is IP we can trim to the true length of the frame. + * Note this now means skb->len holds ntohs(iph->tot_len). + */ + if (pskb_trim_rcsum(skb, len)) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto drop; } return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, -- cgit v0.10.2 From e9c604227391308b185aa6b14c7f93b0a0c2e51b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:25:52 -0700 Subject: [IPV4]: Avoid common branch misprediction while checking csum in ip_rcv() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 322b082..6a06e15 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -400,7 +400,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, iph = skb->nh.iph; - if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto inhdr_error; len = ntohs(iph->tot_len); -- cgit v0.10.2 From d245407e758b14c464c609b632873f85709360c7 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:26:12 -0700 Subject: [IPV4]: Move ip options parsing out of ip_rcv_finish() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 6a06e15..48e4ddc 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -279,6 +279,58 @@ int ip_local_deliver(struct sk_buff *skb) ip_local_deliver_finish); } +static inline int ip_rcv_options(struct sk_buff *skb) +{ + struct ip_options *opt; + struct iphdr *iph; + struct net_device *dev = skb->dev; + + /* It looks as overkill, because not all + IP options require packet mangling. + But it is the easiest for now, especially taking + into account that combination of IP options + and running sniffer is extremely rare condition. + --ANK (980813) + */ + if (skb_cow(skb, skb_headroom(skb))) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto drop; + } + + iph = skb->nh.iph; + + if (ip_options_compile(NULL, skb)) { + IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + goto drop; + } + + opt = &(IPCB(skb)->opt); + if (unlikely(opt->srr)) { + struct in_device *in_dev = in_dev_get(dev); + if (in_dev) { + if (!IN_DEV_SOURCE_ROUTE(in_dev)) { + if (IN_DEV_LOG_MARTIANS(in_dev) && + net_ratelimit()) + printk(KERN_INFO "source route option " + "%u.%u.%u.%u -> %u.%u.%u.%u\n", + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + in_dev_put(in_dev); + goto drop; + } + + in_dev_put(in_dev); + } + + if (ip_options_rcv_srr(skb)) + goto drop; + } + + return 0; +drop: + return -1; +} + static inline int ip_rcv_finish(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb) } #endif - if (iph->ihl > 5) { - struct ip_options *opt; - - /* It looks as overkill, because not all - IP options require packet mangling. - But it is the easiest for now, especially taking - into account that combination of IP options - and running sniffer is extremely rare condition. - --ANK (980813) - */ - - if (skb_cow(skb, skb_headroom(skb))) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); - goto drop; - } - iph = skb->nh.iph; - - if (ip_options_compile(NULL, skb)) - goto inhdr_error; - - opt = &(IPCB(skb)->opt); - if (opt->srr) { - struct in_device *in_dev = in_dev_get(dev); - if (in_dev) { - if (!IN_DEV_SOURCE_ROUTE(in_dev)) { - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n", - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - in_dev_put(in_dev); - goto drop; - } - in_dev_put(in_dev); - } - if (ip_options_rcv_srr(skb)) - goto drop; - } - } + if (iph->ihl > 5 && ip_rcv_options(skb)) + goto drop; return dst_input(skb); -inhdr_error: - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); return NET_RX_DROP; -- cgit v0.10.2 From 3e192beaf5ef260a31e84a12c0a04eff2eec02ab Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:26:30 -0700 Subject: [IPV4]: Avoid common branch mispredictions in ip_rcv_finish() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 48e4ddc..7e78095 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -333,16 +333,16 @@ drop: static inline int ip_rcv_finish(struct sk_buff *skb) { - struct net_device *dev = skb->dev; struct iphdr *iph = skb->nh.iph; - int err; /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (skb->dst == NULL) { - if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { + if (likely(skb->dst == NULL)) { + int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, + skb->dev); + if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); goto drop; @@ -350,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb) } #ifdef CONFIG_NET_CLS_ROUTE - if (skb->dst->tclassid) { + if (unlikely(skb->dst->tclassid)) { struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); u32 idx = skb->dst->tclassid; st[idx&0xFF].o_packets++; -- cgit v0.10.2 From 9070683bdac59a3b26e2ce6dd0d05fbfcb3fc7d8 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:27:09 -0700 Subject: [IPV4]: Remove some dead code from ip_forward() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 77094aa..0923add1 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb) * that reaches zero, we must reply an ICMP control message telling * that the packet's lifetime expired. */ - - iph = skb->nh.iph; - - if (iph->ttl <= 1) + if (skb->nh.iph->ttl <= 1) goto too_many_hops; if (!xfrm4_route_forward(skb)) goto drop; - iph = skb->nh.iph; rt = (struct rtable*)skb->dst; if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) -- cgit v0.10.2 From 33d043d65bbd3d97efca96c9bbada443cac3c4da Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:27:34 -0700 Subject: [IPV4]: ip_finish_output() can be inlined Signed-off-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 19f24f7..3f1a263 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -200,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output(struct sk_buff *skb) +static inline int ip_finish_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; -- cgit v0.10.2 From c68e64cfb5ac675b002215b5659146b73d2e9d5d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 21 Aug 2005 05:07:37 -0300 Subject: [CCID3]: Reintroduce ccid3hctx_t_rto CCID3 keeps this variable in usecs, inet_connection_socks in jiffies, so to avoid Mars orbiter losses lets reintroduce ccid3hctx_t_rto 8) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 6941490..ffd5b44 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -905,7 +905,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) hctx->ccid3hctx_x = 10; } /* Schedule no feedback timer to expire in max(4 * R, 2 * s / X) */ - next_tmout = max_t(u32, inet_csk(sk)->icsk_rto, + next_tmout = max_t(u32, hctx->ccid3hctx_t_rto, 2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x / 10)); break; default: @@ -1180,8 +1180,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) r_sample); /* Update timeout interval */ - inet_csk(sk)->icsk_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, - USEC_PER_SEC); + hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, + USEC_PER_SEC); /* Update receive rate */ hctx->ccid3hctx_x_recv = x_recv; /* x_recv in bytes per second */ @@ -1227,7 +1227,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* to prevent divide by zero below */ /* Schedule no feedback timer to expire in max(4 * R, 2 * s / X) */ - next_tmout = max(inet_csk(sk)->icsk_rto, + next_tmout = max(hctx->ccid3hctx_t_rto, (2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x / 10))); /* maths with 100000 and 10 is to prevent overflow with 32 bit */ @@ -1340,7 +1340,7 @@ static int ccid3_hc_tx_init(struct sock *sk) hctx->ccid3hctx_x = hctx->ccid3hctx_s; /* set transmission rate to 1 packet per second */ hctx->ccid3hctx_rtt = 4; /* See ccid3_hc_tx_packet_sent win_count calculatation */ - inet_csk(sk)->icsk_rto = USEC_PER_SEC; + hctx->ccid3hctx_t_rto = USEC_PER_SEC; hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; INIT_LIST_HEAD(&hctx->ccid3hctx_hist); init_timer(&hctx->ccid3hctx_no_feedback_timer); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index d2705fb..5ef72cd 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -80,6 +80,7 @@ struct ccid3_hc_tx_sock { struct timer_list ccid3hctx_no_feedback_timer; struct timeval ccid3hctx_t_ld; struct timeval ccid3hctx_t_nom; + u32 ccid3hctx_t_rto; u32 ccid3hctx_t_ipi; u32 ccid3hctx_delta; struct list_head ccid3hctx_hist; -- cgit v0.10.2 From 2807d4ffb0dccb8f932c3e1701b6b6163153d333 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 21 Aug 2005 05:33:48 -0300 Subject: [DCCP]: Fix seqno setting in dccp_v4_ctl_send_reset Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index cc5d60d..02ebf1f 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -887,6 +887,7 @@ static void dccp_v4_ctl_send_reset(struct sk_buff *rxskb) sizeof(struct dccp_hdr_reset); struct sk_buff *skb; struct dst_entry *dst; + u64 seqno; /* Never send a reset in response to a reset. */ if (rxdh->dccph_type == DCCP_PKT_RESET) @@ -920,7 +921,12 @@ static void dccp_v4_ctl_send_reset(struct sk_buff *rxskb) dccp_hdr_reset(skb)->dccph_reset_code = DCCP_SKB_CB(rxskb)->dccpd_reset_code; - dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); + /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */ + seqno = 0; + if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1); + + dccp_hdr_set_seq(dh, seqno); dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), DCCP_SKB_CB(rxskb)->dccpd_seq); -- cgit v0.10.2 From a3054d48b9b9d6290eccc9fc09c286ef450d9b1d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 21 Aug 2005 05:35:18 -0300 Subject: [DCCP]: Give more info on Step 6 failure debug printk Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/input.c b/net/dccp/input.c index 3c4cbff..ce8396b 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -136,8 +136,15 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) DCCP_PKT_WITHOUT_ACK_SEQ)) dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq; } else { - LIMIT_NETDEBUG(KERN_WARNING "DCCP: Step 6 failed, " - "sending SYNC...\n"); + LIMIT_NETDEBUG(KERN_WARNING "DCCP: Step 6 failed for %s packet, " + "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " + "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " + "sending SYNC...\n", + dccp_packet_name(dh->dccph_type), + lswl, DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swh, + (DCCP_SKB_CB(skb)->dccpd_ack_seq == + DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists", + lawl, DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awh); dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); return -1; } -- cgit v0.10.2 From 03ace394ac9bcad38043a381ae5f4860b9c9fa1c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 21 Aug 2005 05:36:45 -0300 Subject: [DCCP]: Fix the ACK and SEQ window variables settings This is from a first audit, more eyeballs are more than welcome. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index aab72b8..33968a9 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -340,13 +340,11 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - u64 tmp_gsr; - dccp_set_seqno(&tmp_gsr, + dp->dccps_gsr = seq; + dccp_set_seqno(&dp->dccps_swl, (dp->dccps_gsr + 1 - (dp->dccps_options.dccpo_sequence_window / 4))); - dp->dccps_gsr = seq; - dccp_set_seqno(&dp->dccps_swl, max48(tmp_gsr, dp->dccps_isr)); dccp_set_seqno(&dp->dccps_swh, (dp->dccps_gsr + (3 * dp->dccps_options.dccpo_sequence_window) / 4)); @@ -355,13 +353,11 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq) static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - u64 tmp_gss; - dccp_set_seqno(&tmp_gss, + dp->dccps_awh = dp->dccps_gss = seq; + dccp_set_seqno(&dp->dccps_awl, (dp->dccps_gss - dp->dccps_options.dccpo_sequence_window + 1)); - dp->dccps_awl = max48(tmp_gss, dp->dccps_iss); - dp->dccps_awh = dp->dccps_gss = seq; } extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb); diff --git a/net/dccp/input.c b/net/dccp/input.c index ce8396b..5847cf4 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -314,7 +314,19 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, } dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; - dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); + dccp_update_gsr(sk, dp->dccps_isr); + /* + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + * + * AWL was adjusted in dccp_v4_connect -acme + */ + dccp_set_seqno(&dp->dccps_swl, + max48(dp->dccps_swl, dp->dccps_isr)); if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 || ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) { diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 02ebf1f..647e669 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -309,6 +309,16 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, usin->sin_port); dccp_update_gss(sk, dp->dccps_iss); + /* + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + */ + dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss)); + inet->id = dp->dccps_iss ^ jiffies; err = dccp_connect(sk); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index b8e6720..ce5dff4 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -146,6 +146,19 @@ out_free: newdp->dccps_iss = dreq->dreq_iss; dccp_update_gss(newsk, dreq->dreq_iss); + /* + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + */ + dccp_set_seqno(&newdp->dccps_swl, + max48(newdp->dccps_swl, newdp->dccps_isr)); + dccp_set_seqno(&newdp->dccps_awl, + max48(newdp->dccps_awl, newdp->dccps_iss)); + dccp_init_xmit_timers(newsk); DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 47b1616..aa34b57 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -144,7 +144,7 @@ static void dccp_retransmit_timer(struct sock *sk) /* * sk->sk_send_head has to have one skb with * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP - * packet types (REQUEST, RESPONSE, the ACK in the 3way hanshake + * packet types (REQUEST, RESPONSE, the ACK in the 3way handshake * (PARTOPEN timer), etc). */ BUG_TRAP(sk->sk_send_head != NULL); -- cgit v0.10.2 From 24117727b753426d85ba09671c24854834f81b2c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 21 Aug 2005 05:40:16 -0300 Subject: [DCCP]: Fix ackno setting in SYNC/SYNCACK packets Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/output.c b/net/dccp/output.c index 384fd09..708fc3c 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -44,15 +44,8 @@ int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) int err, set_ack = 1; u64 ackno = dp->dccps_gsr; - /* - * FIXME: study DCCP_PKT_SYNC[ACK] to see what is the right - * thing to do here... - */ dccp_inc_seqno(&dp->dccps_gss); - dcb->dccpd_seq = dp->dccps_gss; - dccp_insert_options(sk, skb); - switch (dcb->dccpd_type) { case DCCP_PKT_DATA: set_ack = 0; @@ -62,6 +55,9 @@ int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) ackno = dcb->dccpd_seq; break; } + + dcb->dccpd_seq = dp->dccps_gss; + dccp_insert_options(sk, skb); skb->h.raw = skb_push(skb, dccp_header_size); dh = dccp_hdr(skb); -- cgit v0.10.2 From 7567662ba896ee0c33d6215f32e2011488a6d1bf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 Aug 2005 23:30:34 -0700 Subject: [NETFILTER]: Add string match Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv4/ipt_string.h b/include/linux/netfilter_ipv4/ipt_string.h new file mode 100644 index 0000000..a265f6e --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_string.h @@ -0,0 +1,18 @@ +#ifndef _IPT_STRING_H +#define _IPT_STRING_H + +#define IPT_STRING_MAX_PATTERN_SIZE 128 +#define IPT_STRING_MAX_ALGO_NAME_SIZE 16 + +struct ipt_string_info +{ + u_int16_t from_offset; + u_int16_t to_offset; + char algo[IPT_STRING_MAX_ALGO_NAME_SIZE]; + char pattern[IPT_STRING_MAX_PATTERN_SIZE]; + u_int8_t patlen; + u_int8_t invert; + struct ts_config __attribute__((aligned(8))) *config; +}; + +#endif /*_IPT_STRING_H*/ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 3f7e6e4..f2bea6e 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -410,6 +410,18 @@ config IP_NF_MATCH_HASHLIMIT destination IP' or `500pps from any given source IP' with a single IPtables rule. +config IP_NF_MATCH_STRING + tristate 'string match support' + depends on IP_NF_IPTABLES + select TEXTSEARCH + select TEXTSEARCH_KMP + select TEXTSEARCH_FSM + help + This option adds a `string' match, which allows you to look for + pattern matchings in packets. + + To compile it as a module, choose M here. If unsure, say N. + # `filter', generic and specific targets config IP_NF_FILTER tristate "Packet filtering" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 7c8ae85..89cae69 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o +obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o # targets obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c new file mode 100644 index 0000000..b5def20 --- /dev/null +++ b/net/ipv4/netfilter/ipt_string.c @@ -0,0 +1,91 @@ +/* String matching match for iptables + * + * (C) 2005 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_DESCRIPTION("IP tables string match module"); +MODULE_LICENSE("GPL"); + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ts_state state; + struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo; + + memset(&state, 0, sizeof(struct ts_state)); + + return (skb_find_text((struct sk_buff *)skb, conf->from_offset, + conf->to_offset, conf->config, &state) + != UINT_MAX) && !conf->invert; +} + +#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m) + +static int checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_string_info *conf = matchinfo; + struct ts_config *ts_conf; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info))) + return 0; + + /* Damn, can't handle this case properly with iptables... */ + if (conf->from_offset > conf->to_offset) + return 0; + + ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, + GFP_KERNEL, TS_AUTOLOAD); + if (IS_ERR(ts_conf)) + return 0; + + conf->config = ts_conf; + + return 1; +} + +static void destroy(void *matchinfo, unsigned int matchsize) +{ + textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); +} + +static struct ipt_match string_match = { + .name = "string", + .match = match, + .checkentry = checkentry, + .destroy = destroy, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&string_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&string_match); +} + +module_init(init); +module_exit(fini); -- cgit v0.10.2 From 764d8a9f240729534a1d8a0ffd39e722cf5cc5af Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 21 Aug 2005 23:31:06 -0700 Subject: [NETFILTER]: Add IPv6 REJECT target Originally written by Yasuyuki Kozakai , taken from netfilter patch-o-matic and fixed up to work with current kernels. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv6/ip6t_REJECT.h b/include/linux/netfilter_ipv6/ip6t_REJECT.h new file mode 100644 index 0000000..6be6504 --- /dev/null +++ b/include/linux/netfilter_ipv6/ip6t_REJECT.h @@ -0,0 +1,18 @@ +#ifndef _IP6T_REJECT_H +#define _IP6T_REJECT_H + +enum ip6t_reject_with { + IP6T_ICMP6_NO_ROUTE, + IP6T_ICMP6_ADM_PROHIBITED, + IP6T_ICMP6_NOT_NEIGHBOUR, + IP6T_ICMP6_ADDR_UNREACH, + IP6T_ICMP6_PORT_UNREACH, + IP6T_ICMP6_ECHOREPLY, + IP6T_TCP_RESET +}; + +struct ip6t_reject_info { + u_int32_t with; /* reject type */ +}; + +#endif /*_IP6T_REJECT_H*/ diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index cd15519..8a10c2d 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -199,6 +199,16 @@ config IP6_NF_TARGET_LOG To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_TARGET_REJECT + tristate "REJECT target support" + depends on IP6_NF_FILTER + help + The REJECT target allows a filtering rule to specify that an ICMPv6 + error should be issued in response to an incoming packet, rather + than silently being dropped. + + To compile it as a module, choose M here. If unsure, say N. + # if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then # dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER # if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 847651d..70f6ba6 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -24,4 +24,5 @@ obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o +obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ip6t_NFQUEUE.o diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c new file mode 100644 index 0000000..14316c3 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -0,0 +1,284 @@ +/* + * IP6 tables REJECT target module + * Linux INET6 implementation + * + * Copyright (C)2003 USAGI/WIDE Project + * + * Authors: + * Yasuyuki Kozakai + * + * Based on net/ipv4/netfilter/ipt_REJECT.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Yasuyuki KOZAKAI "); +MODULE_DESCRIPTION("IP6 tables REJECT target module"); +MODULE_LICENSE("GPL"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Send RST reply */ +static void send_reset(struct sk_buff *oldskb) +{ + struct sk_buff *nskb; + struct tcphdr otcph, *tcph; + unsigned int otcplen, hh_len; + int tcphoff, needs_ack; + struct ipv6hdr *oip6h = oldskb->nh.ipv6h, *ip6h; + struct dst_entry *dst = NULL; + u8 proto; + struct flowi fl; + + if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || + (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { + DEBUGP("ip6t_REJECT: addr is not unicast.\n"); + return; + } + + proto = oip6h->nexthdr; + tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto); + + if ((tcphoff < 0) || (tcphoff > oldskb->len)) { + DEBUGP("ip6t_REJECT: Can't get TCP header.\n"); + return; + } + + otcplen = oldskb->len - tcphoff; + + /* IP header checks: fragment, too short. */ + if ((proto != IPPROTO_TCP) || (otcplen < sizeof(struct tcphdr))) { + DEBUGP("ip6t_REJECT: proto(%d) != IPPROTO_TCP, or too short. otcplen = %d\n", + proto, otcplen); + return; + } + + if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) + BUG(); + + /* No RST for RST. */ + if (otcph.rst) { + DEBUGP("ip6t_REJECT: RST is set\n"); + return; + } + + /* Check checksum. */ + if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP, + skb_checksum(oldskb, tcphoff, otcplen, 0))) { + DEBUGP("ip6t_REJECT: TCP checksum is invalid\n"); + return; + } + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr); + ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr); + fl.fl_ip_sport = otcph.dest; + fl.fl_ip_dport = otcph.source; + dst = ip6_route_output(NULL, &fl); + if (dst == NULL) + return; + if (dst->error || + xfrm_lookup(&dst, &fl, NULL, 0)) { + dst_release(dst); + return; + } + + hh_len = (dst->dev->hard_header_len + 15)&~15; + nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) + + sizeof(struct tcphdr) + dst->trailer_len, + GFP_ATOMIC); + + if (!nskb) { + if (net_ratelimit()) + printk("ip6t_REJECT: Can't alloc skb\n"); + dst_release(dst); + return; + } + + nskb->dst = dst; + + skb_reserve(nskb, hh_len + dst->header_len); + + ip6h = nskb->nh.ipv6h = (struct ipv6hdr *) + skb_put(nskb, sizeof(struct ipv6hdr)); + ip6h->version = 6; + ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT); + ip6h->nexthdr = IPPROTO_TCP; + ip6h->payload_len = htons(sizeof(struct tcphdr)); + ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); + ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr); + + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + /* Truncate to length (no data) */ + tcph->doff = sizeof(struct tcphdr)/4; + tcph->source = otcph.dest; + tcph->dest = otcph.source; + + if (otcph.ack) { + needs_ack = 0; + tcph->seq = otcph.ack_seq; + tcph->ack_seq = 0; + } else { + needs_ack = 1; + tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin + + otcplen - (otcph.doff<<2)); + tcph->seq = 0; + } + + /* Reset flags */ + ((u_int8_t *)tcph)[13] = 0; + tcph->rst = 1; + tcph->ack = needs_ack; + tcph->window = 0; + tcph->urg_ptr = 0; + tcph->check = 0; + + /* Adjust TCP checksum */ + tcph->check = csum_ipv6_magic(&nskb->nh.ipv6h->saddr, + &nskb->nh.ipv6h->daddr, + sizeof(struct tcphdr), IPPROTO_TCP, + csum_partial((char *)tcph, + sizeof(struct tcphdr), 0)); + + NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, nskb, NULL, nskb->dst->dev, + dst_output); +} + +static inline void +send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum) +{ + if (hooknum == NF_IP6_LOCAL_OUT && skb_in->dev == NULL) + skb_in->dev = &loopback_dev; + + icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL); +} + +static unsigned int reject6_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ip6t_reject_info *reject = targinfo; + + DEBUGP(KERN_DEBUG "%s: medium point\n", __FUNCTION__); + /* WARNING: This code causes reentry within ip6tables. + This means that the ip6tables jump stack is now crap. We + must return an absolute verdict. --RR */ + switch (reject->with) { + case IP6T_ICMP6_NO_ROUTE: + send_unreach(*pskb, ICMPV6_NOROUTE, hooknum); + break; + case IP6T_ICMP6_ADM_PROHIBITED: + send_unreach(*pskb, ICMPV6_ADM_PROHIBITED, hooknum); + break; + case IP6T_ICMP6_NOT_NEIGHBOUR: + send_unreach(*pskb, ICMPV6_NOT_NEIGHBOUR, hooknum); + break; + case IP6T_ICMP6_ADDR_UNREACH: + send_unreach(*pskb, ICMPV6_ADDR_UNREACH, hooknum); + break; + case IP6T_ICMP6_PORT_UNREACH: + send_unreach(*pskb, ICMPV6_PORT_UNREACH, hooknum); + break; + case IP6T_ICMP6_ECHOREPLY: + /* Do nothing */ + break; + case IP6T_TCP_RESET: + send_reset(*pskb); + break; + default: + if (net_ratelimit()) + printk(KERN_WARNING "ip6t_REJECT: case %u not handled yet\n", reject->with); + break; + } + + return NF_DROP; +} + +static int check(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip6t_reject_info *rejinfo = targinfo; + + if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_reject_info))) { + DEBUGP("ip6t_REJECT: targinfosize %u != 0\n", targinfosize); + return 0; + } + + /* Only allow these for packet filtering. */ + if (strcmp(tablename, "filter") != 0) { + DEBUGP("ip6t_REJECT: bad table `%s'.\n", tablename); + return 0; + } + + if ((hook_mask & ~((1 << NF_IP6_LOCAL_IN) + | (1 << NF_IP6_FORWARD) + | (1 << NF_IP6_LOCAL_OUT))) != 0) { + DEBUGP("ip6t_REJECT: bad hook mask %X\n", hook_mask); + return 0; + } + + if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { + printk("ip6t_REJECT: ECHOREPLY is not supported.\n"); + return 0; + } else if (rejinfo->with == IP6T_TCP_RESET) { + /* Must specify that it's a TCP packet */ + if (e->ipv6.proto != IPPROTO_TCP + || (e->ipv6.invflags & IP6T_INV_PROTO)) { + DEBUGP("ip6t_REJECT: TCP_RESET illegal for non-tcp\n"); + return 0; + } + } + + return 1; +} + +static struct ip6t_target ip6t_reject_reg = { + .name = "REJECT", + .target = reject6_target, + .checkentry = check, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + if (ip6t_register_target(&ip6t_reject_reg)) + return -EINVAL; + return 0; +} + +static void __exit fini(void) +{ + ip6t_unregister_target(&ip6t_reject_reg); +} + +module_init(init); +module_exit(fini); -- cgit v0.10.2 From 05465343bf74e00c8c2c5a310740157de3149f27 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 21 Aug 2005 23:31:43 -0700 Subject: [NETFILTER]: Add goto target Originally written by Henrik Nordstrom , taken from netfilter patch-o-matic and added ip6_tables support. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 12ce478..d19d65c 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -109,7 +109,8 @@ struct ipt_counters /* Values for "flag" field in struct ipt_ip (general ip structure). */ #define IPT_F_FRAG 0x01 /* Set if rule is a fragment rule */ -#define IPT_F_MASK 0x01 /* All possible flag bits mask. */ +#define IPT_F_GOTO 0x02 /* Set if jump is a goto */ +#define IPT_F_MASK 0x03 /* All possible flag bits mask. */ /* Values for "inv" field in struct ipt_ip. */ #define IPT_INV_VIA_IN 0x01 /* Invert the sense of IN IFACE. */ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index f1ce3b0..58c72a5 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -111,7 +111,8 @@ struct ip6t_counters #define IP6T_F_PROTO 0x01 /* Set if rule cares about upper protocols */ #define IP6T_F_TOS 0x02 /* Match the TOS. */ -#define IP6T_F_MASK 0x03 /* All possible flag bits mask. */ +#define IP6T_F_GOTO 0x04 /* Set if jump is a goto */ +#define IP6T_F_MASK 0x07 /* All possible flag bits mask. */ /* Values for "inv" field in struct ip6t_ip6. */ #define IP6T_INV_VIA_IN 0x01 /* Invert the sense of IN IFACE. */ diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ff8d85d..eef99a1 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -340,8 +340,8 @@ ipt_do_table(struct sk_buff **pskb, back->comefrom); continue; } - if (table_base + v - != (void *)e + e->next_offset) { + if (table_base + v != (void *)e + e->next_offset + && !(e->ip.flags & IPT_F_GOTO)) { /* Save old back ptr in next entry */ struct ipt_entry *next = (void *)e + e->next_offset; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 41a67cf..1cb8adb 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -433,8 +433,8 @@ ip6t_do_table(struct sk_buff **pskb, back->comefrom); continue; } - if (table_base + v - != (void *)e + e->next_offset) { + if (table_base + v != (void *)e + e->next_offset + && !(e->ipv6.flags & IP6T_F_GOTO)) { /* Save old back ptr in next entry */ struct ip6t_entry *next = (void *)e + e->next_offset; -- cgit v0.10.2 From 58e45131dc269eff0983c6d44494f9e687686900 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 21 Aug 2005 23:46:01 -0700 Subject: [DCCP]: Fix printf format warnings on 64-bit. Signed-off-by: David S. Miller diff --git a/net/dccp/input.c b/net/dccp/input.c index 5847cf4..8540253 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -141,10 +141,16 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " "sending SYNC...\n", dccp_packet_name(dh->dccph_type), - lswl, DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swh, + (unsigned long long) lswl, + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_seq, + (unsigned long long) dp->dccps_swh, (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists", - lawl, DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awh); + (unsigned long long) lawl, + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq, + (unsigned long long) dp->dccps_awh); dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); return -1; } -- cgit v0.10.2 From 7ad07e7cf343181002c10c39d3f57a88e4903d4f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:50:06 -0700 Subject: [DCCP]: Implement the CLOSING timer So that we retransmit CLOSE/CLOSEREQ packets till they elicit an answer or we hit a timeout. Most of the machinery uses TCP approaches, this code has to be polished & audited, but this is better than we had before. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 33968a9..53994f1 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -255,7 +255,7 @@ extern int dccp_v4_checksum(const struct sk_buff *skb, extern int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code); -extern void dccp_send_close(struct sock *sk); +extern void dccp_send_close(struct sock *sk, const int active); struct dccp_skb_cb { __u8 dccpd_type; diff --git a/net/dccp/input.c b/net/dccp/input.c index 8540253..02af05e 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -31,14 +31,9 @@ static void dccp_fin(struct sock *sk, struct sk_buff *skb) static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb) { - switch (sk->sk_state) { - case DCCP_PARTOPEN: - case DCCP_OPEN: - dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_CLOSED); - break; - } + dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED); + dccp_fin(sk, skb); + dccp_set_state(sk, DCCP_CLOSED); } static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) @@ -54,13 +49,8 @@ static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) return; } - switch (sk->sk_state) { - case DCCP_PARTOPEN: - case DCCP_OPEN: - dccp_set_state(sk, DCCP_CLOSING); - dccp_send_close(sk); - break; - } + dccp_set_state(sk, DCCP_CLOSING); + dccp_send_close(sk, 0); } static inline void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) @@ -562,6 +552,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); goto discard; + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { + dccp_rcv_closereq(sk, skb); + goto discard; + } else if (dh->dccph_type == DCCP_PKT_CLOSE) { + dccp_rcv_close(sk, skb); + return 0; } switch (sk->sk_state) { diff --git a/net/dccp/output.c b/net/dccp/output.c index 708fc3c..630ca77 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -96,8 +96,7 @@ int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr); - if (dcb->dccpd_type == DCCP_PKT_ACK || - dcb->dccpd_type == DCCP_PKT_DATAACK) + if (set_ack) dccp_event_ack_sent(sk); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); @@ -429,18 +428,15 @@ void dccp_send_sync(struct sock *sk, const u64 seq, * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under * any circumstances. */ -void dccp_send_close(struct sock *sk) +void dccp_send_close(struct sock *sk, const int active) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; + const unsigned int prio = active ? GFP_KERNEL : GFP_ATOMIC; - /* Socket is locked, keep trying until memory is available. */ - for (;;) { - skb = alloc_skb(sk->sk_prot->max_header, GFP_KERNEL); - if (skb != NULL) - break; - yield(); - } + skb = alloc_skb(sk->sk_prot->max_header, prio); + if (skb == NULL) + return; /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, sk->sk_prot->max_header); @@ -449,7 +445,12 @@ void dccp_send_close(struct sock *sk) DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; skb_set_owner_w(skb, sk); - dccp_transmit_skb(sk, skb); + if (active) { + BUG_TRAP(sk->sk_send_head == NULL); + sk->sk_send_head = skb; + dccp_transmit_skb(sk, skb_clone(skb, prio)); + } else + dccp_transmit_skb(sk, skb); ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8b613c3..a3f8a80 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -402,12 +402,15 @@ void dccp_close(struct sock *sk, long timeout) /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (dccp_close_state(sk)) { - dccp_send_close(sk); + dccp_send_close(sk, 1); } sk_stream_wait_close(sk, timeout); adjudge_to_death: + /* + * It is the last release_sock in its life. It will remove backlog. + */ release_sock(sk); /* * Now socket is owned by kernel and we acquire BH lock @@ -419,11 +422,26 @@ adjudge_to_death: sock_hold(sk); sock_orphan(sk); - - if (sk->sk_state != DCCP_CLOSED) + + /* + * The last release_sock may have processed the CLOSE or RESET + * packet moving sock to CLOSED state, if not we have to fire + * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination" + * in draft-ietf-dccp-spec-11. -acme + */ + if (sk->sk_state == DCCP_CLOSING) { + /* FIXME: should start at 2 * RTT */ + /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); +#if 0 + /* Yeah, we should use sk->sk_prot->orphan_count, etc */ dccp_set_state(sk, DCCP_CLOSED); +#endif + } - atomic_inc(&dccp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); if (sk->sk_state == DCCP_CLOSED) inet_csk_destroy_sock(sk); -- cgit v0.10.2 From 20472af986569b0615bd77f0fd7ca9e3d33e9895 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:50:21 -0700 Subject: [DCCP]: Fix skb leak in dccp_sendmsg Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/proto.c b/net/dccp/proto.c index a3f8a80..2b6db18 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -206,6 +206,18 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto out_discard; rc = dccp_write_xmit(sk, skb, len); + /* + * XXX we don't use sk_write_queue, so just discard the packet. + * Current plan however is to _use_ sk_write_queue with + * an algorith similar to tcp_sendmsg, where the main difference + * is that in DCCP we have to respect packet boundaries, so + * no coalescing of skbs. + * + * This bug was _quickly_ found & fixed by just looking at an OSTRA + * generated callgraph 8) -acme + */ + if (rc != 0) + goto out_discard; out_release: release_sock(sk); return rc ? : len; -- cgit v0.10.2 From a4beb1b64f5846e216bf2c439022df480151902a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:50:45 -0700 Subject: [DCCP]: Send a DATAACK packet when we have a TIMESTAMP_ECHO pending Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/output.c b/net/dccp/output.c index 630ca77..f96dedd 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -171,6 +171,7 @@ int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, const int len) * dccps_ack_pending or use icsk. */ } else if (inet_csk_ack_scheduled(sk) || + dp->dccps_timestamp_echo != 0 || (dp->dccps_options.dccpo_send_ack_vector && ap->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1 && ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)) -- cgit v0.10.2 From 012e13eac7579fcc7618df4ca1d5af3cdc03748c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:51:13 -0700 Subject: [CCID]: Make ccid_hc_[rt]x_exit accept NULL arguments Just like kfree, etc it will just not call the CCID exit routines when the private data area is set to NULL. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 95eb47d..c6767b2 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -97,13 +97,15 @@ static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk) static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk) { - if (ccid->ccid_hc_rx_exit != NULL) + if (ccid->ccid_hc_rx_exit != NULL && + dccp_sk(sk)->dccps_hc_rx_ccid_private != NULL) ccid->ccid_hc_rx_exit(sk); } static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk) { - if (ccid->ccid_hc_tx_exit != NULL) + if (ccid->ccid_hc_tx_exit != NULL && + dccp_sk(sk)->dccps_hc_tx_ccid_private != NULL) ccid->ccid_hc_tx_exit(sk); } -- cgit v0.10.2 From d4b81ff70547b40c9b0742b163e8354560003cc0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:51:36 -0700 Subject: [DCCP]: Export dccp_insert_option_timestamp to CCIDs And don't insert a TIMESTAMP option in all packets, leave the decision to the CCIDs. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 53994f1..c6ba07e 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -364,6 +364,8 @@ extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb); extern void dccp_insert_option_elapsed_time(struct sock *sk, struct sk_buff *skb, u32 elapsed_time); +extern void dccp_insert_option_timestamp(struct sock *sk, + struct sk_buff *skb); extern void dccp_insert_option(struct sock *sk, struct sk_buff *skb, unsigned char option, const void *value, unsigned char len); diff --git a/net/dccp/options.c b/net/dccp/options.c index 7ecffdf..eabcc8f 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -348,7 +348,7 @@ void dccp_insert_option_elapsed_time(struct sock *sk, (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); } -EXPORT_SYMBOL(dccp_insert_option_elapsed_time); +EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time); static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) { @@ -426,8 +426,7 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) (unsigned long long) ap->dccpap_ack_ackno); } -static inline void dccp_insert_option_timestamp(struct sock *sk, - struct sk_buff *skb) +void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb) { struct timeval tv; u32 now; @@ -441,6 +440,8 @@ static inline void dccp_insert_option_timestamp(struct sock *sk, dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now)); } +EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp); + static void dccp_insert_option_timestamp_echo(struct sock *sk, struct sk_buff *skb) { @@ -504,7 +505,6 @@ void dccp_insert_options(struct sock *sk, struct sk_buff *skb) DCCP_MAX_SEQNO + 1)) dccp_insert_option_ack_vector(sk, skb); - dccp_insert_option_timestamp(sk, skb); if (dp->dccps_timestamp_echo != 0) dccp_insert_option_timestamp_echo(sk, skb); } -- cgit v0.10.2 From 4fded33b3e8177d1d2eec0ccc69af8dfe8b4c3c3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:51:59 -0700 Subject: [CCID3]: Calculate the RTT in the RX half connection Using TIMESTAMP_ECHO and ELAPSED_TIME options received. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index ffd5b44..48c36af 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -1010,7 +1010,6 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; struct timeval now; -// ccid3_pr_debug("%s, sk=%p, more=%d, len=%d\n", dccp_role(sk), sk, more, len); BUG_ON(hctx == NULL); if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) { @@ -1562,23 +1561,27 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { const struct dccp_sock *dp = dccp_sk(sk); + u32 x_recv, pinv; struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return; - if (hcrx->ccid3hcrx_elapsed_time != 0 && !dccp_packet_without_ack(skb)) - dccp_insert_option_elapsed_time(sk, skb, hcrx->ccid3hcrx_elapsed_time); - - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { - const u32 x_recv = htonl(hcrx->ccid3hcrx_x_recv); - const u32 pinv = htonl(hcrx->ccid3hcrx_pinv); - - dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, &pinv, sizeof(pinv)); - dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE, &x_recv, sizeof(x_recv)); - } - DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter; + + if (dccp_packet_without_ack(skb)) + return; + + if (hcrx->ccid3hcrx_elapsed_time != 0) + dccp_insert_option_elapsed_time(sk, skb, + hcrx->ccid3hcrx_elapsed_time); + dccp_insert_option_timestamp(sk, skb); + x_recv = htonl(hcrx->ccid3hcrx_x_recv); + pinv = htonl(hcrx->ccid3hcrx_pinv); + dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, + &pinv, sizeof(pinv)); + dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE, + &x_recv, sizeof(x_recv)); } /* Weights used to calculate loss event rate */ @@ -1860,8 +1863,10 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + const struct dccp_options_received *opt_recv; struct dccp_rx_hist_entry *packet; struct timeval now; + u32 now_usecs; u8 win_count; u32 p_prev; int ins; @@ -1876,24 +1881,25 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) BUG_ON(!(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA || hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA)); + opt_recv = &dp->dccps_options_received; + switch (DCCP_SKB_CB(skb)->dccpd_type) { case DCCP_PKT_ACK: if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) return; case DCCP_PKT_DATAACK: - if (dp->dccps_options_received.dccpor_timestamp_echo == 0) + if (opt_recv->dccpor_timestamp_echo == 0) break; p_prev = hcrx->ccid3hcrx_rtt; do_gettimeofday(&now); - /* hcrx->ccid3hcrx_rtt = now - dp->dccps_options_received.dccpor_timestamp_echo - - usecs_to_jiffies(dp->dccps_options_received.dccpor_elapsed_time * 10); - FIXME - I think above code is broken - have to look at options more, will also need - to fix pr_debug below */ + now_usecs = now.tv_sec * USEC_PER_SEC + now.tv_usec; + hcrx->ccid3hcrx_rtt = now_usecs - + (opt_recv->dccpor_timestamp_echo - + opt_recv->dccpor_elapsed_time) * 10; if (p_prev != hcrx->ccid3hcrx_rtt) - ccid3_pr_debug("%s, sk=%p, New RTT estimate=%lu jiffies, tstamp_echo=%u, elapsed time=%u\n", - dccp_role(sk), sk, hcrx->ccid3hcrx_rtt, - dp->dccps_options_received.dccpor_timestamp_echo, - dp->dccps_options_received.dccpor_elapsed_time); + ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n", + dccp_role(sk), hcrx->ccid3hcrx_rtt, + opt_recv->dccpor_elapsed_time); break; case DCCP_PKT_DATA: break; @@ -1904,8 +1910,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) return; } - packet = dccp_rx_hist_entry_new(ccid3_rx_hist, - dp->dccps_options_received.dccpor_ndp, + packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv->dccpor_ndp, skb, SLAB_ATOMIC); if (packet == NULL) { ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet to history (consider it lost)!", @@ -1930,9 +1935,9 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) case TFRC_RSTATE_DATA: hcrx->ccid3hcrx_bytes_recv += skb->len - dccp_hdr(skb)->dccph_doff * 4; if (ins == 0) { - do_gettimeofday(&now); - if ((now_delta(hcrx->ccid3hcrx_tstamp_last_ack)) >= hcrx->ccid3hcrx_rtt) { - hcrx->ccid3hcrx_tstamp_last_ack = now; + if (now_delta(hcrx->ccid3hcrx_tstamp_last_ack) >= + hcrx->ccid3hcrx_rtt) { + do_gettimeofday(&hcrx->ccid3hcrx_tstamp_last_ack); ccid3_hc_rx_send_feedback(sk); } return; @@ -1946,8 +1951,8 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* Dealing with packet loss */ - ccid3_pr_debug("%s, sk=%p(%s), skb=%p, data loss! Reacting...\n", - dccp_role(sk), sk, dccp_state_name(sk->sk_state), skb); + ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n", + dccp_role(sk), sk, dccp_state_name(sk->sk_state)); ccid3_hc_rx_detect_loss(sk); p_prev = hcrx->ccid3hcrx_p; @@ -1985,7 +1990,11 @@ static int ccid3_hc_rx_init(struct sock *sk) hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist); INIT_LIST_HEAD(&hcrx->ccid3hcrx_loss_interval_hist); - + /* + * XXX this seems to be paranoid, need to think more about this, for + * now start with something different than zero. -acme + */ + hcrx->ccid3hcrx_rtt = USEC_PER_SEC / 5; return 0; } -- cgit v0.10.2 From 2babe1f6fea717c36c008c878fe095d1ca5696c1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:52:35 -0700 Subject: [DCCP]: Introduce dccp_get_info And also hc_tx and hc_rx get_info functions for the CCIDs to fill in information that is specific to them. For now reusing struct tcp_info, later I'll try to figure out a better solution, for now its really nice to get this kind of info: [root@qemu ~]# ./ss -danemi State Recv-Q Send-Q Local Addr:Port Peer Addr:Port LISTEN 0 0 *:5001 *:* ino:628 sk:c1340040 mem:(r0,w0,f0,t0) cwnd:0 ssthresh:0 ESTAB 0 0 172.20.0.2:5001 172.20.0.1:32785 ino:629 sk:c13409a0 mem:(r0,w0,f0,t0) ts rto:1000 rtt:0.004/0 cwnd:0 ssthresh:0 rcv_rtt:61.377 This, for instance, shows that we're not congestion controlling ACKs, as the above output is in the ttcp receiving host, and ttcp is a one way app, i.e. the received never calls sendmsg, so ccid_hc_tx_send_packet is never called, so the TX half connection stays in TFRC_SSTATE_NO_SENT state and hctx_rtt is never calculated, stays with the value set in ccid3_hc_tx_init, 4us, as show above in milliseconds (0.004ms), upcoming patches will fix this. rcv_rtt seems sane tho, matching ping results :-) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index c6767b2..962f1e9 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -50,6 +50,10 @@ struct ccid { struct sk_buff *skb, int len); void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more, int len); + void (*ccid_hc_rx_get_info)(struct sock *sk, + struct tcp_info *info); + void (*ccid_hc_tx_get_info)(struct sock *sk, + struct tcp_info *info); }; extern int ccid_register(struct ccid *ccid); @@ -159,4 +163,18 @@ static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, if (ccid->ccid_hc_rx_insert_options != NULL) ccid->ccid_hc_rx_insert_options(sk, skb); } + +static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, + struct tcp_info *info) +{ + if (ccid->ccid_hc_rx_get_info != NULL) + ccid->ccid_hc_rx_get_info(sk, info); +} + +static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, + struct tcp_info *info) +{ + if (ccid->ccid_hc_tx_get_info != NULL) + ccid->ccid_hc_tx_get_info(sk, info); +} #endif /* _CCID_H */ diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 48c36af..fe4cc85 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -2020,6 +2020,31 @@ static void ccid3_hc_rx_exit(struct sock *sk) dp->dccps_hc_rx_ccid_private = NULL; } +static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) +{ + const struct dccp_sock *dp = dccp_sk(sk); + const struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + + if (hcrx == NULL) + return; + + info->tcpi_ca_state = hcrx->ccid3hcrx_state; + info->tcpi_options |= TCPI_OPT_TIMESTAMPS; + info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; +} + +static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) +{ + const struct dccp_sock *dp = dccp_sk(sk); + const struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + + if (hctx == NULL) + return; + + info->tcpi_rto = hctx->ccid3hctx_t_rto; + info->tcpi_rtt = hctx->ccid3hctx_rtt; +} + static struct ccid ccid3 = { .ccid_id = 3, .ccid_name = "ccid3", @@ -2037,6 +2062,8 @@ static struct ccid ccid3 = { .ccid_hc_rx_exit = ccid3_hc_rx_exit, .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, + .ccid_hc_rx_get_info = ccid3_hc_rx_get_info, + .ccid_hc_tx_get_info = ccid3_hc_tx_get_info, }; module_param(ccid3_debug, int, 0444); diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 0b10c17..f675d8e6 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -14,19 +14,43 @@ #include #include +#include "ccid.h" #include "dccp.h" +static void dccp_get_info(struct sock *sk, struct tcp_info *info) +{ + struct dccp_sock *dp = dccp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + + memset(info, 0, sizeof(*info)); + + info->tcpi_state = sk->sk_state; + info->tcpi_retransmits = icsk->icsk_retransmits; + info->tcpi_probes = icsk->icsk_probes_out; + info->tcpi_backoff = icsk->icsk_backoff; + info->tcpi_pmtu = dp->dccps_pmtu_cookie; + + if (dp->dccps_options.dccpo_send_ack_vector) + info->tcpi_options |= TCPI_OPT_SACK; + + ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); + ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); +} + static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { r->idiag_rqueue = r->idiag_wqueue = 0; + + if (_info != NULL) + dccp_get_info(sk, _info); } static struct inet_diag_handler dccp_diag_handler = { .idiag_hashinfo = &dccp_hashinfo, .idiag_get_info = dccp_diag_get_info, .idiag_type = DCCPDIAG_GETSOCK, - .idiag_info_size = 0, + .idiag_info_size = sizeof(struct tcp_info), }; static int __init dccp_diag_init(void) -- cgit v0.10.2 From dc40c7bc76054f5e4382835ca2bafb895b993a8a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:52:58 -0700 Subject: [ICSK]: Generalise tcp_listen_poll Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 4d7e708..8a87a3a 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -260,6 +260,16 @@ extern void inet_csk_reqsk_queue_prune(struct sock *parent, const unsigned long max_rto); extern void inet_csk_destroy_sock(struct sock *sk); + +/* + * LISTEN is a special case for poll.. + */ +static inline unsigned int inet_csk_listen_poll(const struct sock *sk) +{ + return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? + (POLLIN | POLLRDNORM) : 0; +} + extern int inet_csk_listen_start(struct sock *sk, const int nr_table_entries); extern void inet_csk_listen_stop(struct sock *sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 02848e7..68626de 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -310,15 +310,6 @@ void tcp_enter_memory_pressure(void) EXPORT_SYMBOL(tcp_enter_memory_pressure); /* - * LISTEN is a special case for poll.. - */ -static __inline__ unsigned int tcp_listen_poll(struct sock *sk, - poll_table *wait) -{ - return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (POLLIN | POLLRDNORM) : 0; -} - -/* * Wait for a TCP event. * * Note that we don't need to lock the socket, as the upper poll layers @@ -333,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == TCP_LISTEN) - return tcp_listen_poll(sk, wait); + return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events by poll logic and correct handling of state changes -- cgit v0.10.2 From 8efa544f9c84919c047dc2f96e308c902e8fb1a4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:54:00 -0700 Subject: [DCCP]: Call the HC exit routines at dccp_v4_destroy_sock Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 647e669..3cf2cbc 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1303,6 +1303,8 @@ static int dccp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash != NULL) inet_put_port(&dccp_hashinfo, sk); + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts); dp->dccps_hc_rx_ackpkts = NULL; ccid_exit(dp->dccps_hc_rx_ccid, sk); -- cgit v0.10.2 From 331968bd0c1b2437f3ad773cbf55f2e0737bafc0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:54:23 -0700 Subject: [DCCP]: Initial dccp_poll implementation Tested with a patched netcat, no horror stories so far 8) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/input.c b/net/dccp/input.c index 02af05e..ef29cef 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -34,6 +34,7 @@ static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb) dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED); dccp_fin(sk, skb); dccp_set_state(sk, DCCP_CLOSED); + sk_wake_async(sk, 1, POLL_HUP); } static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2b6db18..600dda5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -140,6 +140,62 @@ int dccp_disconnect(struct sock *sk, int flags) return err; } +/* + * Wait for a DCCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +static unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + unsigned int mask; + struct sock *sk = sock->sk; + + poll_wait(file, sk->sk_sleep, wait); + if (sk->sk_state == DCCP_LISTEN) + return inet_csk_listen_poll(sk); + + /* Socket is not locked. We are protected from async events + by poll logic and correct handling of state changes + made by another threads is impossible in any case. + */ + + mask = 0; + if (sk->sk_err) + mask = POLLERR; + + if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED) + mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM; + + /* Connected? */ + if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { + if (atomic_read(&sk->sk_rmem_alloc) > 0) + mask |= POLLIN | POLLRDNORM; + + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ + set_bit(SOCK_ASYNC_NOSPACE, + &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + /* Race breaker. If space is freed after + * wspace test but before the flags are set, + * IO signal will be lost. + */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + mask |= POLLOUT | POLLWRNORM; + } + } + } + return mask; +} + int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { dccp_pr_debug("entry\n"); @@ -478,7 +534,8 @@ static struct proto_ops inet_dccp_ops = { .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, - .poll = sock_no_poll, + /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ + .poll = dccp_poll, .ioctl = inet_ioctl, /* FIXME: work on inet_listen to rename it to sock_common_listen */ .listen = inet_dccp_listen, -- cgit v0.10.2 From 0c7770c740156c8802c23d24fc094d06967d997d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 23 Aug 2005 21:59:41 -0700 Subject: [IPV4]: FIB trie cleanup This is a redo of earlier cleanup stuff: * replace DBG() macro with pr_debug() * get rid of duplicate extern's that are already in fib_lookup.h * use BUG_ON and WARN_ON * don't use BUG checks for null pointers where next statement would get a fault anyway * remove debug printout when rebalance causes deep tree * remove trailing blanks Signed-off-by: Stephen Hemminger Signed-off-by: Robert Olsson Signed-off-by: David S. Miller diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 395f64d..9c4c7f0 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -157,10 +157,6 @@ struct trie { unsigned int revision; }; -static int trie_debug = 0; - -#define DBG(x...) do { if (trie_debug) printk(x); } while (0) - static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); static struct node *resize(struct trie *t, struct tnode *tn); @@ -168,12 +164,6 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); -extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); -extern int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int *dflt); - -extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id, - struct nlmsghdr *n, struct netlink_skb_parms *req); static kmem_cache_t *fn_alias_kmem; static struct trie *trie_local = NULL, *trie_main = NULL; @@ -294,11 +284,9 @@ static void fn_free_alias(struct fib_alias *fa) */ -static void check_tnode(struct tnode *tn) +static inline void check_tnode(const struct tnode *tn) { - if (tn && tn->pos+tn->bits > 32) { - printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits); - } + WARN_ON(tn && tn->pos+tn->bits > 32); } static int halve_threshold = 25; @@ -374,21 +362,19 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) tn->empty_children = 1<= 1<bits) { - printk("bits=%d, i=%d\n", tn->bits, i); - BUG(); - } + BUG_ON(i >= 1<bits); + write_lock_bh(&fib_lock); chi = tn->child[i]; @@ -459,8 +443,8 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (!tn) return NULL; - DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n", - tn, inflate_threshold, halve_threshold); + pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", + tn, inflate_threshold, halve_threshold); /* No children */ if (tn->empty_children == tnode_child_length(tn)) { @@ -625,11 +609,11 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) int olen = tnode_child_length(tn); int i; - DBG("In inflate\n"); + pr_debug("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); - if (!tn) + if (!tn) return ERR_PTR(-ENOMEM); /* @@ -749,12 +733,12 @@ nomem: int size = tnode_child_length(tn); int j; - for(j = 0; j < size; j++) + for (j = 0; j < size; j++) if (tn->child[j]) tnode_free((struct tnode *)tn->child[j]); tnode_free(tn); - + return ERR_PTR(-ENOMEM); } } @@ -766,7 +750,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) int i; int olen = tnode_child_length(tn); - DBG("In halve\n"); + pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); @@ -785,14 +769,14 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) right = tnode_get_child(oldtnode, i+1); /* Two nonempty children */ - if (left && right) { + if (left && right) { struct tnode *newn; - + newn = tnode_new(left->key, tn->pos + tn->bits, 1); - - if (!newn) + + if (!newn) goto nomem; - + put_child(t, tn, i/2, (struct node *)newn); } @@ -810,7 +794,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) continue; put_child(t, tn, i/2, right); continue; - } + } if (right == NULL) { put_child(t, tn, i/2, left); @@ -820,9 +804,6 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) /* Two nonempty children */ newBinNode = (struct tnode *) tnode_get_child(tn, i/2); put_child(t, tn, i/2, NULL); - - BUG_ON(!newBinNode); - put_child(t, newBinNode, 0, left); put_child(t, newBinNode, 1, right); put_child(t, tn, i/2, resize(t, newBinNode)); @@ -834,12 +815,12 @@ nomem: int size = tnode_child_length(tn); int j; - for(j = 0; j < size; j++) + for (j = 0; j < size; j++) if (tn->child[j]) tnode_free((struct tnode *)tn->child[j]); tnode_free(tn); - + return ERR_PTR(-ENOMEM); } } @@ -939,22 +920,10 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) t_key cindex, key; struct tnode *tp = NULL; - BUG_ON(!tn); - key = tn->key; i = 0; while (tn != NULL && NODE_PARENT(tn) != NULL) { - if (i > 10) { - printk("Rebalance tn=%p \n", tn); - if (tn) - printk("tn->parent=%p \n", NODE_PARENT(tn)); - - printk("Rebalance tp=%p \n", tp); - if (tp) - printk("tp->parent=%p \n", NODE_PARENT(tp)); - } - BUG_ON(i > 12); /* Why is this a bug? -ojn */ i++; @@ -1019,10 +988,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) pos = tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); - if (n && NODE_PARENT(n) != tn) { - printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); - BUG(); - } + BUG_ON(n && NODE_PARENT(n) != tn); } else break; } @@ -1076,8 +1042,6 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) NODE_SET_PARENT(l, tp); - BUG_ON(!tp); - cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, (struct node *)l); } else { @@ -1158,7 +1122,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, key = ntohl(key); - DBG("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); + pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); mask = ntohl(inet_make_mask(plen)); @@ -1282,7 +1246,8 @@ err: return err; } -static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, +static inline int check_leaf(struct trie *t, struct leaf *l, + t_key key, int *plen, const struct flowi *flp, struct fib_result *res) { int err, i; @@ -1511,7 +1476,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) struct node *n = t->trie; struct leaf *l; - DBG("entering trie_leaf_remove(%p)\n", n); + pr_debug("entering trie_leaf_remove(%p)\n", n); /* Note that in the case skipped bits, those bits are *not* checked! * When we finish this, we will have NULL or a T_LEAF, and the @@ -1523,10 +1488,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) check_tnode(tn); n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); - if (n && NODE_PARENT(n) != tn) { - printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); - BUG(); - } + BUG_ON(n && NODE_PARENT(n) != tn); } l = (struct leaf *) n; @@ -1594,7 +1556,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, if (!fa) return -ESRCH; - DBG("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); + pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); fa_to_delete = NULL; fa_head = fa->fa_list.prev; @@ -1762,7 +1724,7 @@ static int fn_trie_flush(struct fib_table *tb) if (ll && hlist_empty(&ll->list)) trie_leaf_remove(t, ll->key); - DBG("trie_flush found=%d\n", found); + pr_debug("trie_flush found=%d\n", found); return found; } -- cgit v0.10.2 From 98a82febb6340466824c3a453738d4fbd05db81a Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 24 Aug 2005 11:35:51 -0700 Subject: [AX25/NETROM]: Cleanup direct calls into IP stack Get rid of the calls to ip_rcv and arp_rcv which were layering violations anyway. With those being replaced by netif_rx, less parts of AX.25 and relatives depend on INET support actually being enabled. This also will make PF_PACKET sockets work for IP and ARP packets received over AX.25 and for IP packets over NET/ROM. Signed-off-by: Ralf Baechle DL5RB Signed-off-by: David S. Miller diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 0357705..810c9c7 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -9,7 +9,6 @@ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) */ -#include #include #include #include @@ -26,9 +25,7 @@ #include #include #include -#include /* For ip_rcv */ #include -#include /* For arp_rcv */ #include #include #include @@ -114,7 +111,6 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) pid = *skb->data; -#ifdef CONFIG_INET if (pid == AX25_P_IP) { /* working around a TCP bug to keep additional listeners * happy. TCP re-uses the buffer and destroys the original @@ -132,10 +128,9 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) skb->dev = ax25->ax25_dev->dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_IP); - ip_rcv(skb, skb->dev, NULL, skb->dev); /* Wrong ptype */ + netif_rx(skb); return 1; } -#endif if (pid == AX25_P_SEGMENT) { skb_pull(skb, 1); /* Remove PID */ return ax25_rx_fragment(ax25, skb); @@ -250,7 +245,6 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, /* Now we are pointing at the pid byte */ switch (skb->data[1]) { -#ifdef CONFIG_INET case AX25_P_IP: skb_pull(skb,2); /* drop PID/CTRL */ skb->h.raw = skb->data; @@ -258,7 +252,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_IP); - ip_rcv(skb, dev, ptype, dev); /* Note ptype here is the wrong one, fix me later */ + netif_rx(skb); break; case AX25_P_ARP: @@ -268,9 +262,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_ARP); - arp_rcv(skb, dev, ptype, dev); /* Note ptype here is wrong... */ + netif_rx(skb); break; -#endif case AX25_P_TEXT: /* Now find a suitable dgram socket */ sk = ax25_get_socket(&dest, &src, SOCK_DGRAM); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 9aa8b14..4b53de9 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -858,17 +858,16 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) frametype = skb->data[19] & 0x0F; flags = skb->data[19] & 0xF0; -#ifdef CONFIG_INET /* * Check for an incoming IP over NET/ROM frame. */ - if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { + if (frametype == NR_PROTOEXT && + circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); skb->h.raw = skb->data; return nr_rx_ip(skb, dev); } -#endif /* * Find an existing socket connection, based on circuit ID, if it's diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 83eb41d..263da4c 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -38,8 +38,6 @@ #include #include -#ifdef CONFIG_INET - /* * Only allow IP over NET/ROM frames through if the netrom device is up. */ @@ -64,11 +62,12 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) skb->nh.raw = skb->data; skb->pkt_type = PACKET_HOST; - ip_rcv(skb, skb->dev, NULL, skb->dev); + netif_rx(skb); return 1; } +#ifdef CONFIG_INET static int nr_rebuild_header(struct sk_buff *skb) { -- cgit v0.10.2 From c91326db013ddff478da8a8d66fb99ef4579f19a Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 24 Aug 2005 11:37:45 -0700 Subject: [AX25/NETROM/ROSE]: Kill net/ip.h inclusion All these are claiming to include to get ip_rcv() but in fact don't need the header at all, so away with the inclusion. Signed-off-by: Ralf Baechle DL5RB Signed-off-by: David S. Miller diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c index 5d0f8fb..edcaa89 100644 --- a/net/ax25/ax25_ds_in.c +++ b/net/ax25/ax25_ds_in.c @@ -22,7 +22,6 @@ #include #include #include -#include /* For ip_rcv */ #include #include #include diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c index 83a3338..f6ed283 100644 --- a/net/ax25/ax25_std_in.c +++ b/net/ax25/ax25_std_in.c @@ -29,7 +29,6 @@ #include #include #include -#include /* For ip_rcv */ #include #include #include diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 2fcba9e..64b81a7 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -23,7 +23,6 @@ #include #include #include -#include /* For ip_rcv */ #include #include #include diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index a52417b..8348d33 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -26,7 +26,6 @@ #include #include #include -#include /* For ip_rcv */ #include #include #include -- cgit v0.10.2 From 3625796806419d97641d90e0f197eab9b952212e Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 24 Aug 2005 11:38:53 -0700 Subject: [IPV4]: Module export of ip_rcv() no longer needed. With ip_rcv nowhere outside the IP stack being used anymore it's EXPORT_SYMBOL is not needed any longer either. Signed-off-by: Ralf Baechle DL5RB Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7e78095..220a8b5 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -442,5 +442,4 @@ out: return NET_RX_DROP; } -EXPORT_SYMBOL(ip_rcv); EXPORT_SYMBOL(ip_statistics); -- cgit v0.10.2 From e5b4376074e02b783e56a8f7c42d544e18112c4e Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Thu, 25 Aug 2005 13:01:03 -0700 Subject: [IPV4]: Prepare FIB core for RCU. * RCU versions of hlist_***_rcu * fib_alias partial rcu port just whats needed now. Signed-off-by: Robert Olsson Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller diff --git a/include/linux/list.h b/include/linux/list.h index 0f2435f..9b9b0ee 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -634,6 +634,27 @@ static inline void hlist_add_after(struct hlist_node *n, next->next->pprev = &next->next; } +static inline void hlist_add_before_rcu(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + smp_wmb(); + next->pprev = &n->next; + *(n->pprev) = n; +} + +static inline void hlist_add_after_rcu(struct hlist_node *prev, + struct hlist_node *n) +{ + n->next = prev->next; + n->pprev = &prev->next; + smp_wmb(); + prev->next = n; + if (n->next) + n->next->pprev = &n->next; +} + #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index b729d97..ef6609e 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -7,6 +7,7 @@ struct fib_alias { struct list_head fa_list; + struct rcu_head rcu; struct fib_info *fa_info; u8 fa_tos; u8 fa_type; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7e4651b..d41219e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -854,6 +854,7 @@ failure: return NULL; } +/* Note! fib_semantic_match intentionally uses RCU list functions. */ int fib_semantic_match(struct list_head *head, const struct flowi *flp, struct fib_result *res, __u32 zone, __u32 mask, int prefixlen) @@ -861,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, struct fib_alias *fa; int nh_sel = 0; - list_for_each_entry(fa, head, fa_list) { + list_for_each_entry_rcu(fa, head, fa_list) { int err; if (fa->fa_tos && -- cgit v0.10.2 From 2373ce1ca04dd46bf2b8b0f9a799eb2a90da92fb Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Thu, 25 Aug 2005 13:01:29 -0700 Subject: [IPV4]: Convert FIB Trie to RCU. * Removes RW-lock * Proteced read functions uses rcu_dereference proteced with rcu_read_lock() * writing of procted pointer w. rcu_assigen_pointer * Insert/Replace atomic list_replace_rcu * A BUG_ON condition removed.in trie_rebalance With help from Paul E. McKenney. Signed-off-by: Robert Olsson Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9c4c7f0..ff21748 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -43,7 +43,7 @@ * 2 of the License, or (at your option) any later version. */ -#define VERSION "0.325" +#define VERSION "0.402" #include #include @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -81,22 +82,19 @@ #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) -static DEFINE_RWLOCK(fib_lock); - typedef unsigned int t_key; #define T_TNODE 0 #define T_LEAF 1 #define NODE_TYPE_MASK 0x1UL #define NODE_PARENT(node) \ - ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK)) -#define NODE_SET_PARENT(node, ptr) \ - ((node)->parent = (((unsigned long)(ptr)) | \ - ((node)->parent & NODE_TYPE_MASK))) -#define NODE_INIT_PARENT(node, type) \ - ((node)->parent = (type)) -#define NODE_TYPE(node) \ - ((node)->parent & NODE_TYPE_MASK) + ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK))) + +#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) + +#define NODE_SET_PARENT(node, ptr) \ + rcu_assign_pointer((node)->parent, \ + ((unsigned long)(ptr)) | NODE_TYPE(node)) #define IS_TNODE(n) (!(n->parent & T_LEAF)) #define IS_LEAF(n) (n->parent & T_LEAF) @@ -110,10 +108,12 @@ struct leaf { t_key key; unsigned long parent; struct hlist_head list; + struct rcu_head rcu; }; struct leaf_info { struct hlist_node hlist; + struct rcu_head rcu; int plen; struct list_head falh; }; @@ -125,6 +125,7 @@ struct tnode { unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ unsigned short full_children; /* KEYLENGTH bits needed */ unsigned short empty_children; /* KEYLENGTH bits needed */ + struct rcu_head rcu; struct node *child[0]; }; @@ -168,11 +169,14 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t); static kmem_cache_t *fn_alias_kmem; static struct trie *trie_local = NULL, *trie_main = NULL; + +/* rcu_read_lock needs to be hold by caller from readside */ + static inline struct node *tnode_get_child(struct tnode *tn, int i) { BUG_ON(i >= 1 << tn->bits); - return tn->child[i]; + return rcu_dereference(tn->child[i]); } static inline int tnode_child_length(const struct tnode *tn) @@ -213,14 +217,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b) return i; } -/* Candidate for fib_semantics */ - -static void fn_free_alias(struct fib_alias *fa) -{ - fib_release_info(fa->fa_info); - kmem_cache_free(fn_alias_kmem, fa); -} - /* To understand this stuff, an understanding of keys and all their bits is necessary. Every node in the trie has a key associated with it, but not @@ -292,53 +288,57 @@ static inline void check_tnode(const struct tnode *tn) static int halve_threshold = 25; static int inflate_threshold = 50; -static struct leaf *leaf_new(void) + +static void __alias_free_mem(struct rcu_head *head) { - struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); - if (l) { - NODE_INIT_PARENT(l, T_LEAF); - INIT_HLIST_HEAD(&l->list); - } - return l; + struct fib_alias *fa = container_of(head, struct fib_alias, rcu); + kmem_cache_free(fn_alias_kmem, fa); } -static struct leaf_info *leaf_info_new(int plen) +static inline void alias_free_mem_rcu(struct fib_alias *fa) { - struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); - - if (!li) - return NULL; + call_rcu(&fa->rcu, __alias_free_mem); +} - li->plen = plen; - INIT_LIST_HEAD(&li->falh); +static void __leaf_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct leaf, rcu)); +} - return li; +static inline void free_leaf(struct leaf *leaf) +{ + call_rcu(&leaf->rcu, __leaf_free_rcu); } -static inline void free_leaf(struct leaf *l) +static void __leaf_info_free_rcu(struct rcu_head *head) { - kfree(l); + kfree(container_of(head, struct leaf_info, rcu)); } -static inline void free_leaf_info(struct leaf_info *li) +static inline void free_leaf_info(struct leaf_info *leaf) { - kfree(li); + call_rcu(&leaf->rcu, __leaf_info_free_rcu); } static struct tnode *tnode_alloc(unsigned int size) { - if (size <= PAGE_SIZE) { - return kmalloc(size, GFP_KERNEL); - } else { - return (struct tnode *) - __get_free_pages(GFP_KERNEL, get_order(size)); - } + struct page *pages; + + if (size <= PAGE_SIZE) + return kcalloc(size, 1, GFP_KERNEL); + + pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); + if (!pages) + return NULL; + + return page_address(pages); } -static void __tnode_free(struct tnode *tn) +static void __tnode_free_rcu(struct rcu_head *head) { + struct tnode *tn = container_of(head, struct tnode, rcu); unsigned int size = sizeof(struct tnode) + - (1 << tn->bits) * sizeof(struct node *); + (1 << tn->bits) * sizeof(struct node *); if (size <= PAGE_SIZE) kfree(tn); @@ -346,6 +346,31 @@ static void __tnode_free(struct tnode *tn) free_pages((unsigned long)tn, get_order(size)); } +static inline void tnode_free(struct tnode *tn) +{ + call_rcu(&tn->rcu, __tnode_free_rcu); +} + +static struct leaf *leaf_new(void) +{ + struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); + if (l) { + l->parent = T_LEAF; + INIT_HLIST_HEAD(&l->list); + } + return l; +} + +static struct leaf_info *leaf_info_new(int plen) +{ + struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); + if (li) { + li->plen = plen; + INIT_LIST_HEAD(&li->falh); + } + return li; +} + static struct tnode* tnode_new(t_key key, int pos, int bits) { int nchildren = 1<parent = T_TNODE; tn->pos = pos; tn->bits = bits; tn->key = key; @@ -367,17 +392,6 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) return tn; } -static void tnode_free(struct tnode *tn) -{ - if (IS_LEAF(tn)) { - free_leaf((struct leaf *)tn); - pr_debug("FL %p \n", tn); - } else { - __tnode_free(tn); - pr_debug("FT %p \n", tn); - } -} - /* * Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 @@ -403,13 +417,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) { - struct node *chi; + struct node *chi = tn->child[i]; int isfull; BUG_ON(i >= 1<bits); - write_lock_bh(&fib_lock); - chi = tn->child[i]; /* update emptyChildren */ if (n == NULL && chi != NULL) @@ -430,8 +442,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w if (n) NODE_SET_PARENT(n, tn); - tn->child[i] = n; - write_unlock_bh(&fib_lock); + rcu_assign_pointer(tn->child[i], n); } static struct node *resize(struct trie *t, struct tnode *tn) @@ -456,17 +467,12 @@ static struct node *resize(struct trie *t, struct tnode *tn) for (i = 0; i < tnode_child_length(tn); i++) { struct node *n; - write_lock_bh(&fib_lock); n = tn->child[i]; - if (!n) { - write_unlock_bh(&fib_lock); + if (!n) continue; - } /* compress one level */ - NODE_INIT_PARENT(n, NODE_TYPE(n)); - - write_unlock_bh(&fib_lock); + NODE_SET_PARENT(n, NULL); tnode_free(tn); return n; } @@ -577,24 +583,17 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Only one child remains */ - if (tn->empty_children == tnode_child_length(tn) - 1) for (i = 0; i < tnode_child_length(tn); i++) { struct node *n; - write_lock_bh(&fib_lock); - n = tn->child[i]; - if (!n) { - write_unlock_bh(&fib_lock); + if (!n) continue; - } /* compress one level */ - NODE_INIT_PARENT(n, NODE_TYPE(n)); - - write_unlock_bh(&fib_lock); + NODE_SET_PARENT(n, NULL); tnode_free(tn); return n; } @@ -831,19 +830,22 @@ static void trie_init(struct trie *t) return; t->size = 0; - t->trie = NULL; + rcu_assign_pointer(t->trie, NULL); t->revision = 0; #ifdef CONFIG_IP_FIB_TRIE_STATS memset(&t->stats, 0, sizeof(struct trie_use_stats)); #endif } +/* readside most use rcu_read_lock currently dump routines + via get_fa_head and dump */ + static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) { struct hlist_node *node; struct leaf_info *li; - hlist_for_each_entry(li, node, head, hlist) + hlist_for_each_entry_rcu(li, node, head, hlist) if (li->plen == plen) return li; @@ -862,28 +864,27 @@ static inline struct list_head * get_fa_head(struct leaf *l, int plen) static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) { - struct leaf_info *li = NULL, *last = NULL; - struct hlist_node *node; + struct leaf_info *li = NULL, *last = NULL; + struct hlist_node *node; - write_lock_bh(&fib_lock); + if (hlist_empty(head)) { + hlist_add_head_rcu(&new->hlist, head); + } else { + hlist_for_each_entry(li, node, head, hlist) { + if (new->plen > li->plen) + break; - if (hlist_empty(head)) { - hlist_add_head(&new->hlist, head); - } else { - hlist_for_each_entry(li, node, head, hlist) { - if (new->plen > li->plen) - break; - - last = li; - } - if (last) - hlist_add_after(&last->hlist, &new->hlist); - else - hlist_add_before(&new->hlist, &li->hlist); - } - write_unlock_bh(&fib_lock); + last = li; + } + if (last) + hlist_add_after_rcu(&last->hlist, &new->hlist); + else + hlist_add_before_rcu(&new->hlist, &li->hlist); + } } +/* rcu_read_lock needs to be hold by caller from readside */ + static struct leaf * fib_find_node(struct trie *t, u32 key) { @@ -892,7 +893,7 @@ fib_find_node(struct trie *t, u32 key) struct node *n; pos = 0; - n = t->trie; + n = rcu_dereference(t->trie); while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; @@ -915,17 +916,13 @@ fib_find_node(struct trie *t, u32 key) static struct node *trie_rebalance(struct trie *t, struct tnode *tn) { - int i; int wasfull; t_key cindex, key; struct tnode *tp = NULL; key = tn->key; - i = 0; while (tn != NULL && NODE_PARENT(tn) != NULL) { - BUG_ON(i > 12); /* Why is this a bug? -ojn */ - i++; tp = NODE_PARENT(tn); cindex = tkey_extract_bits(key, tp->pos, tp->bits); @@ -945,6 +942,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) return (struct node*) tn; } +/* only used from updater-side */ + static struct list_head * fib_insert_node(struct trie *t, int *err, u32 key, int plen) { @@ -1081,7 +1080,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); } else { - t->trie = (struct node*) tn; /* First tnode */ + rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ tp = tn; } } @@ -1091,7 +1090,8 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) tp, tp->pos, tp->bits, key, plen); /* Rebalance the trie */ - t->trie = trie_rebalance(t, tp); + + rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); done: t->revision++; err: @@ -1166,16 +1166,21 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct fib_info *fi_drop; u8 state; - write_lock_bh(&fib_lock); + err = -ENOBUFS; + new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); + if (new_fa == NULL) + goto out; fi_drop = fa->fa_info; - fa->fa_info = fi; - fa->fa_type = type; - fa->fa_scope = r->rtm_scope; + new_fa->fa_tos = fa->fa_tos; + new_fa->fa_info = fi; + new_fa->fa_type = type; + new_fa->fa_scope = r->rtm_scope; state = fa->fa_state; - fa->fa_state &= ~FA_S_ACCESSED; + new_fa->fa_state &= ~FA_S_ACCESSED; - write_unlock_bh(&fib_lock); + list_replace_rcu(&fa->fa_list, &new_fa->fa_list); + alias_free_mem_rcu(fa); fib_release_info(fi_drop); if (state & FA_S_ACCESSED) @@ -1227,11 +1232,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, goto out_free_new_fa; } - write_lock_bh(&fib_lock); - - list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head)); - - write_unlock_bh(&fib_lock); + list_add_tail_rcu(&new_fa->fa_list, + (fa ? &fa->fa_list : fa_head)); rt_cache_flush(-1); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); @@ -1246,6 +1248,8 @@ err: return err; } + +/* should be clalled with rcu_read_lock */ static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, struct fib_result *res) @@ -1256,7 +1260,7 @@ static inline int check_leaf(struct trie *t, struct leaf *l, struct hlist_head *hhead = &l->list; struct hlist_node *node; - hlist_for_each_entry(li, node, hhead, hlist) { + hlist_for_each_entry_rcu(li, node, hhead, hlist) { i = li->plen; mask = ntohl(inet_make_mask(i)); if (l->key != (key & mask)) @@ -1292,10 +1296,9 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result t_key node_prefix, key_prefix, pref_mismatch; int mp; - n = t->trie; - - read_lock(&fib_lock); + rcu_read_lock(); + n = rcu_dereference(t->trie); if (!n) goto failed; @@ -1465,10 +1468,11 @@ backtrace: failed: ret = 1; found: - read_unlock(&fib_lock); + rcu_read_unlock(); return ret; } +/* only called from updater side */ static int trie_leaf_remove(struct trie *t, t_key key) { t_key cindex; @@ -1503,15 +1507,17 @@ static int trie_leaf_remove(struct trie *t, t_key key) t->revision++; t->size--; + preempt_disable(); tp = NODE_PARENT(n); tnode_free((struct tnode *) n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, NULL); - t->trie = trie_rebalance(t, tp); + rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); } else - t->trie = NULL; + rcu_assign_pointer(t->trie, NULL); + preempt_enable(); return 1; } @@ -1527,7 +1533,6 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct fib_alias *fa, *fa_to_delete; struct list_head *fa_head; struct leaf *l; - int kill_li = 0; struct leaf_info *li; @@ -1560,6 +1565,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, fa_to_delete = NULL; fa_head = fa->fa_list.prev; + list_for_each_entry(fa, fa_head, fa_list) { struct fib_info *fi = fa->fa_info; @@ -1587,18 +1593,12 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, l = fib_find_node(t, key); li = find_leaf_info(&l->list, plen); - write_lock_bh(&fib_lock); - - list_del(&fa->fa_list); + list_del_rcu(&fa->fa_list); if (list_empty(fa_head)) { - hlist_del(&li->hlist); - kill_li = 1; - } - write_unlock_bh(&fib_lock); - - if (kill_li) + hlist_del_rcu(&li->hlist); free_leaf_info(li); + } if (hlist_empty(&l->list)) trie_leaf_remove(t, key); @@ -1606,7 +1606,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(-1); - fn_free_alias(fa); + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); return 0; } @@ -1618,12 +1619,10 @@ static int trie_flush_list(struct trie *t, struct list_head *head) list_for_each_entry_safe(fa, fa_node, head, fa_list) { struct fib_info *fi = fa->fa_info; - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - write_lock_bh(&fib_lock); - list_del(&fa->fa_list); - write_unlock_bh(&fib_lock); - - fn_free_alias(fa); + if (fi && (fi->fib_flags & RTNH_F_DEAD)) { + list_del_rcu(&fa->fa_list); + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); found++; } } @@ -1641,30 +1640,30 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) found += trie_flush_list(t, &li->falh); if (list_empty(&li->falh)) { - write_lock_bh(&fib_lock); - hlist_del(&li->hlist); - write_unlock_bh(&fib_lock); - + hlist_del_rcu(&li->hlist); free_leaf_info(li); } } return found; } +/* rcu_read_lock needs to be hold by caller from readside */ + static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) { struct node *c = (struct node *) thisleaf; struct tnode *p; int idx; + struct node *trie = rcu_dereference(t->trie); if (c == NULL) { - if (t->trie == NULL) + if (trie == NULL) return NULL; - if (IS_LEAF(t->trie)) /* trie w. just a leaf */ - return (struct leaf *) t->trie; + if (IS_LEAF(trie)) /* trie w. just a leaf */ + return (struct leaf *) trie; - p = (struct tnode*) t->trie; /* Start */ + p = (struct tnode*) trie; /* Start */ } else p = (struct tnode *) NODE_PARENT(c); @@ -1679,23 +1678,26 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) last = 1 << p->bits; for (idx = pos; idx < last ; idx++) { - if (!p->child[idx]) + c = rcu_dereference(p->child[idx]); + + if (!c) continue; /* Decend if tnode */ - while (IS_TNODE(p->child[idx])) { - p = (struct tnode*) p->child[idx]; - idx = 0; + while (IS_TNODE(c)) { + p = (struct tnode *) c; + idx = 0; /* Rightmost non-NULL branch */ if (p && IS_TNODE(p)) - while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; + while (!(c = rcu_dereference(p->child[idx])) + && idx < (1<bits)) idx++; /* Done with this tnode? */ - if (idx >= (1 << p->bits) || p->child[idx] == NULL) + if (idx >= (1 << p->bits) || !c) goto up; } - return (struct leaf*) p->child[idx]; + return (struct leaf *) c; } up: /* No more children go up one step */ @@ -1713,6 +1715,7 @@ static int fn_trie_flush(struct fib_table *tb) t->revision++; + rcu_read_lock(); for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { found += trie_flush_leaf(t, l); @@ -1720,6 +1723,7 @@ static int fn_trie_flush(struct fib_table *tb) trie_leaf_remove(t, ll->key); ll = l; } + rcu_read_unlock(); if (ll && hlist_empty(&ll->list)) trie_leaf_remove(t, ll->key); @@ -1745,7 +1749,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib last_resort = NULL; order = -1; - read_lock(&fib_lock); + rcu_read_lock(); l = fib_find_node(t, 0); if (!l) @@ -1758,7 +1762,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib if (list_empty(fa_head)) goto out; - list_for_each_entry(fa, fa_head, fa_list) { + list_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; if (fa->fa_scope != res->scope || @@ -1809,7 +1813,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib } trie_last_dflt = last_idx; out:; - read_unlock(&fib_lock); + rcu_read_unlock(); } static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, @@ -1823,7 +1827,9 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi s_i = cb->args[3]; i = 0; - list_for_each_entry(fa, fah, fa_list) { + /* rcu_read_lock is hold by caller */ + + list_for_each_entry_rcu(fa, fah, fa_list) { if (i < s_i) { i++; continue; @@ -1898,7 +1904,7 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin s_m = cb->args[1]; - read_lock(&fib_lock); + rcu_read_lock(); for (m = 0; m <= 32; m++) { if (m < s_m) continue; @@ -1911,11 +1917,11 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin goto out; } } - read_unlock(&fib_lock); + rcu_read_unlock(); cb->args[1] = m; return skb->len; out: - read_unlock(&fib_lock); + rcu_read_unlock(); return -1; } @@ -2016,7 +2022,7 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, putspace_seq(seq, indent+2); seq_printf(seq, "{/%d...dumping}\n", i); - list_for_each_entry(fa, fa_head, fa_list) { + list_for_each_entry_rcu(fa, fa_head, fa_list) { putspace_seq(seq, indent+2); if (fa->fa_info == NULL) { seq_printf(seq, "Error fa_info=NULL\n"); @@ -2056,28 +2062,28 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, static void trie_dump_seq(struct seq_file *seq, struct trie *t) { - struct node *n = t->trie; + struct node *n; int cindex = 0; int indent = 1; int pend = 0; int depth = 0; struct tnode *tn; - read_lock(&fib_lock); - + rcu_read_lock(); + n = rcu_dereference(t->trie); seq_printf(seq, "------ trie_dump of t=%p ------\n", t); if (!n) { seq_printf(seq, "------ trie is empty\n"); - read_unlock(&fib_lock); + rcu_read_unlock(); return; } printnode_seq(seq, indent, n, pend, cindex, 0); if (!IS_TNODE(n)) { - read_unlock(&fib_lock); + rcu_read_unlock(); return; } @@ -2088,26 +2094,32 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t) depth++; while (tn && cindex < (1 << tn->bits)) { - if (tn->child[cindex]) { + struct node *child = rcu_dereference(tn->child[cindex]); + if (!child) + cindex++; + else { /* Got a child */ + printnode_seq(seq, indent, child, pend, + cindex, tn->bits); - printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); - if (IS_LEAF(tn->child[cindex])) { + if (IS_LEAF(child)) cindex++; - } else { + + else { /* * New tnode. Decend one level */ depth++; - tn = (struct tnode *)tn->child[cindex]; - pend = tn->pos + tn->bits; - putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); + n = child; + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + putspace_seq(seq, indent); + seq_printf(seq, "\\--\n"); indent += 3; cindex = 0; } - } else - cindex++; + } /* * Test if we are done @@ -2132,8 +2144,7 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t) depth--; } } - - read_unlock(&fib_lock); + rcu_read_unlock(); } static struct trie_stat *trie_stat_new(void) @@ -2159,7 +2170,7 @@ static struct trie_stat *trie_stat_new(void) static struct trie_stat *trie_collect_stats(struct trie *t) { - struct node *n = t->trie; + struct node *n; struct trie_stat *s = trie_stat_new(); int cindex = 0; int pend = 0; @@ -2167,11 +2178,13 @@ static struct trie_stat *trie_collect_stats(struct trie *t) if (!s) return NULL; + + rcu_read_lock(); + n = rcu_dereference(t->trie); + if (!n) return s; - read_lock(&fib_lock); - if (IS_TNODE(n)) { struct tnode *tn = (struct tnode *)n; pend = tn->pos+tn->bits; @@ -2179,7 +2192,9 @@ static struct trie_stat *trie_collect_stats(struct trie *t) depth++; while (tn && cindex < (1 << tn->bits)) { - if (tn->child[cindex]) { + struct node *ch = rcu_dereference(tn->child[cindex]); + if (ch) { + /* Got a child */ if (IS_LEAF(tn->child[cindex])) { @@ -2199,7 +2214,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t) s->nodesizes[tn->bits]++; depth++; - n = tn->child[cindex]; + n = ch; tn = (struct tnode *)n; pend = tn->pos+tn->bits; @@ -2236,7 +2251,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t) } } - read_unlock(&fib_lock); + rcu_read_unlock(); return s; } -- cgit v0.10.2 From afdc08b9f9a7174d7912a160f657f39d46379b5e Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 25 Aug 2005 15:34:29 -0700 Subject: [BNX2]: Fix rtnl deadlock in bnx2_close This fixes an rtnl deadlock problem when flush_scheduled_work() is called from bnx2_close(). In rare cases, linkwatch_event() may be on the workqueue from a previous close of a different device and it will try to get the rtnl lock which is already held by dev_close(). The fix is to set a flag if we are in the reset task which is run from the workqueue. bnx2_close() will loop until the flag is cleared. As suggested by Jeff Garzik, the loop is changed to call msleep(1) instead of yield() in the original patch. flush_scheduled_work() is also moved to bnx2_remove_one() before the netdev is freed. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 3a9d6a8..635a585 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -3975,12 +3975,17 @@ bnx2_reset_task(void *data) { struct bnx2 *bp = data; + if (!netif_running(bp->dev)) + return; + + bp->in_reset_task = 1; bnx2_netif_stop(bp); bnx2_init_nic(bp); atomic_set(&bp->intr_sem, 1); bnx2_netif_start(bp); + bp->in_reset_task = 0; } static void @@ -4172,7 +4177,13 @@ bnx2_close(struct net_device *dev) struct bnx2 *bp = dev->priv; u32 reset_code; - flush_scheduled_work(); + /* Calling flush_scheduled_work() may deadlock because + * linkwatch_event() may be on the workqueue and it will try to get + * the rtnl_lock which we are holding. + */ + while (bp->in_reset_task) + msleep(1); + bnx2_netif_stop(bp); del_timer_sync(&bp->timer); if (bp->wol) @@ -5453,6 +5464,8 @@ bnx2_remove_one(struct pci_dev *pdev) struct net_device *dev = pci_get_drvdata(pdev); struct bnx2 *bp = dev->priv; + flush_scheduled_work(); + unregister_netdev(dev); if (bp->regview) diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h index 8214a28..63b94ca 100644 --- a/drivers/net/bnx2.h +++ b/drivers/net/bnx2.h @@ -3874,6 +3874,7 @@ struct bnx2 { int timer_interval; struct timer_list timer; struct work_struct reset_task; + int in_reset_task; /* Used to synchronize phy accesses. */ spinlock_t phy_lock; -- cgit v0.10.2 From cd339a0ed61097d92ce03b6d1042b1e4d58535e7 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 25 Aug 2005 15:35:24 -0700 Subject: [BNX2]: speedup serdes linkup This speeds up link-up time on 5706 SerDes if the link partner does not autoneg, a rather common scenario in blade servers. Some blade servers use IPMI for keyboard input and it's important to minimize link disruptions. The speedup is achieved by shortening the timer to (HZ / 3) during the transient period right after initiating a SerDes autoneg. If autoneg does not complete, parallel detect can be done sooner. After the transient period is over, the timer goes back to its normal HZ interval. As suggested by Jeff Garzik, the timer initialization is moved to bnx2_init_board() from bnx2_open(). An eeprom bit is also added to allow default forced SerDes speed for even faster link-up time. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 635a585..015ff79 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -806,7 +806,19 @@ bnx2_setup_serdes_phy(struct bnx2 *bp) bnx2_write_phy(bp, MII_ADVERTISE, new_adv); bnx2_write_phy(bp, MII_BMCR, bmcr | BMCR_ANRESTART | BMCR_ANENABLE); - bp->serdes_an_pending = SERDES_AN_TIMEOUT / bp->timer_interval; + if (CHIP_NUM(bp) == CHIP_NUM_5706) { + /* Speed up link-up time when the link partner + * does not autonegotiate which is very common + * in blade servers. Some blade servers use + * IPMI for kerboard input and it's important + * to minimize link disruptions. Autoneg. involves + * exchanging base pages plus 3 next pages and + * normally completes in about 120 msec. + */ + bp->current_interval = SERDES_AN_TIMEOUT; + bp->serdes_an_pending = 1; + mod_timer(&bp->timer, jiffies + bp->current_interval); + } } return 0; @@ -3800,6 +3812,9 @@ bnx2_timer(unsigned long data) struct bnx2 *bp = (struct bnx2 *) data; u32 msg; + if (!netif_running(bp->dev)) + return; + if (atomic_read(&bp->intr_sem) != 0) goto bnx2_restart_timer; @@ -3817,6 +3832,8 @@ bnx2_timer(unsigned long data) else if ((bp->link_up == 0) && (bp->autoneg & AUTONEG_SPEED)) { u32 bmcr; + bp->current_interval = bp->timer_interval; + bnx2_read_phy(bp, MII_BMCR, &bmcr); if (bmcr & BMCR_ANENABLE) { @@ -3859,14 +3876,14 @@ bnx2_timer(unsigned long data) } } + else + bp->current_interval = bp->timer_interval; spin_unlock_irqrestore(&bp->phy_lock, flags); } bnx2_restart_timer: - bp->timer.expires = RUN_AT(bp->timer_interval); - - add_timer(&bp->timer); + mod_timer(&bp->timer, jiffies + bp->current_interval); } /* Called with rtnl_lock */ @@ -3919,12 +3936,7 @@ bnx2_open(struct net_device *dev) return rc; } - init_timer(&bp->timer); - - bp->timer.expires = RUN_AT(bp->timer_interval); - bp->timer.data = (unsigned long) bp; - bp->timer.function = bnx2_timer; - add_timer(&bp->timer); + mod_timer(&bp->timer, jiffies + bp->current_interval); atomic_set(&bp->intr_sem, 0); @@ -4485,8 +4497,9 @@ bnx2_nway_reset(struct net_device *dev) spin_lock_irq(&bp->phy_lock); if (CHIP_NUM(bp) == CHIP_NUM_5706) { - bp->serdes_an_pending = SERDES_AN_TIMEOUT / - bp->timer_interval; + bp->current_interval = SERDES_AN_TIMEOUT; + bp->serdes_an_pending = 1; + mod_timer(&bp->timer, jiffies + bp->current_interval); } } @@ -5315,6 +5328,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) bp->stats_ticks = 1000000 & 0xffff00; bp->timer_interval = HZ; + bp->current_interval = HZ; /* Disable WOL support if we are running on a SERDES chip. */ if (CHIP_BOND_ID(bp) & CHIP_BOND_ID_SERDES_BIT) { @@ -5338,6 +5352,15 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) bp->req_line_speed = 0; if (bp->phy_flags & PHY_SERDES_FLAG) { bp->advertising = ETHTOOL_ALL_FIBRE_SPEED | ADVERTISED_Autoneg; + + reg = REG_RD_IND(bp, HOST_VIEW_SHMEM_BASE + + BNX2_PORT_HW_CFG_CONFIG); + reg &= BNX2_PORT_HW_CFG_CFG_DFLT_LINK_MASK; + if (reg == BNX2_PORT_HW_CFG_CFG_DFLT_LINK_1G) { + bp->autoneg = 0; + bp->req_line_speed = bp->line_speed = SPEED_1000; + bp->req_duplex = DUPLEX_FULL; + } } else { bp->advertising = ETHTOOL_ALL_COPPER_SPEED | ADVERTISED_Autoneg; @@ -5345,6 +5368,11 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) bp->req_flow_ctrl = FLOW_CTRL_RX | FLOW_CTRL_TX; + init_timer(&bp->timer); + bp->timer.expires = RUN_AT(bp->timer_interval); + bp->timer.data = (unsigned long) bp; + bp->timer.function = bnx2_timer; + return 0; err_out_unmap: diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h index 63b94ca..e1fb099 100644 --- a/drivers/net/bnx2.h +++ b/drivers/net/bnx2.h @@ -3872,6 +3872,7 @@ struct bnx2 { char *name; int timer_interval; + int current_interval; struct timer_list timer; struct work_struct reset_task; int in_reset_task; @@ -3986,7 +3987,7 @@ struct bnx2 { #define PHY_LOOPBACK 2 u8 serdes_an_pending; -#define SERDES_AN_TIMEOUT (2 * HZ) +#define SERDES_AN_TIMEOUT (HZ / 3) u8 mac_addr[8]; @@ -4172,6 +4173,9 @@ struct fw_info { #define BNX2_PORT_HW_CFG_MAC_LOWER 0x00000054 #define BNX2_PORT_HW_CFG_CONFIG 0x00000058 +#define BNX2_PORT_HW_CFG_CFG_DFLT_LINK_MASK 0x001f0000 +#define BNX2_PORT_HW_CFG_CFG_DFLT_LINK_AN 0x00000000 +#define BNX2_PORT_HW_CFG_CFG_DFLT_LINK_1G 0x00030000 #define BNX2_PORT_HW_CFG_IMD_MAC_A_UPPER 0x00000068 #define BNX2_PORT_HW_CFG_IMD_MAC_A_LOWER 0x0000006c -- cgit v0.10.2 From e89bbf1049aac3625fdafe3657ed8d7d5373d351 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 25 Aug 2005 15:36:58 -0700 Subject: [BNX2]: remove atomics in tx Remove atomic operations in the fast tx path. Expensive atomic operations were used to keep track of the number of available tx descriptors. The new code uses the difference between the consumer and producer index to determine the number of free tx descriptors. As suggested by Jeff Garzik, the name of the inline function is changed to all lower case. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 015ff79..da903b3 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -107,6 +107,15 @@ static struct flash_spec flash_table[] = MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl); +static inline u32 bnx2_tx_avail(struct bnx2 *bp) +{ + u32 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons); + + if (diff > MAX_TX_DESC_CNT) + diff = (diff & MAX_TX_DESC_CNT) - 1; + return (bp->tx_ring_size - diff); +} + static u32 bnx2_reg_rd_ind(struct bnx2 *bp, u32 offset) { @@ -1338,22 +1347,19 @@ bnx2_tx_int(struct bnx2 *bp) } } - atomic_add(tx_free_bd, &bp->tx_avail_bd); + bp->tx_cons = sw_cons; if (unlikely(netif_queue_stopped(bp->dev))) { unsigned long flags; spin_lock_irqsave(&bp->tx_lock, flags); if ((netif_queue_stopped(bp->dev)) && - (atomic_read(&bp->tx_avail_bd) > MAX_SKB_FRAGS)) { + (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) { netif_wake_queue(bp->dev); } spin_unlock_irqrestore(&bp->tx_lock, flags); } - - bp->tx_cons = sw_cons; - } static inline void @@ -2971,7 +2977,6 @@ bnx2_init_tx_ring(struct bnx2 *bp) bp->tx_prod = 0; bp->tx_cons = 0; bp->tx_prod_bseq = 0; - atomic_set(&bp->tx_avail_bd, bp->tx_ring_size); val = BNX2_L2CTX_TYPE_TYPE_L2; val |= BNX2_L2CTX_TYPE_SIZE_L2; @@ -4057,9 +4062,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) u16 prod, ring_prod; int i; - if (unlikely(atomic_read(&bp->tx_avail_bd) < - (skb_shinfo(skb)->nr_frags + 1))) { - + if (unlikely(bnx2_tx_avail(bp) < (skb_shinfo(skb)->nr_frags + 1))) { netif_stop_queue(dev); printk(KERN_ERR PFX "%s: BUG! Tx ring full when queue awake!\n", dev->name); @@ -4156,8 +4159,6 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) prod = NEXT_TX_BD(prod); bp->tx_prod_bseq += skb->len; - atomic_sub(last_frag + 1, &bp->tx_avail_bd); - REG_WR16(bp, MB_TX_CID_ADDR + BNX2_L2CTX_TX_HOST_BIDX, prod); REG_WR(bp, MB_TX_CID_ADDR + BNX2_L2CTX_TX_HOST_BSEQ, bp->tx_prod_bseq); @@ -4166,16 +4167,14 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) bp->tx_prod = prod; dev->trans_start = jiffies; - if (unlikely(atomic_read(&bp->tx_avail_bd) <= MAX_SKB_FRAGS)) { + if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) { unsigned long flags; spin_lock_irqsave(&bp->tx_lock, flags); - if (atomic_read(&bp->tx_avail_bd) <= MAX_SKB_FRAGS) { - netif_stop_queue(dev); - - if (atomic_read(&bp->tx_avail_bd) > MAX_SKB_FRAGS) - netif_wake_queue(dev); - } + netif_stop_queue(dev); + + if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS) + netif_wake_queue(dev); spin_unlock_irqrestore(&bp->tx_lock, flags); } diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h index e1fb099..9ad3f57 100644 --- a/drivers/net/bnx2.h +++ b/drivers/net/bnx2.h @@ -3841,12 +3841,12 @@ struct bnx2 { struct status_block *status_blk; u32 last_status_idx; - atomic_t tx_avail_bd; struct tx_bd *tx_desc_ring; struct sw_bd *tx_buf_ring; u32 tx_prod_bseq; u16 tx_prod; u16 tx_cons; + int tx_ring_size; #ifdef BCM_VLAN struct vlan_group *vlgrp; @@ -3929,7 +3929,6 @@ struct bnx2 { u16 fw_wr_seq; u16 fw_drv_pulse_wr_seq; - int tx_ring_size; dma_addr_t tx_desc_mapping; -- cgit v0.10.2 From c770a65cee7cc250d7bccd99fa55e742988ae4e0 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 25 Aug 2005 15:38:39 -0700 Subject: [BNX2]: change irq locks to bh locks Change all locks from spin_lock_irqsave() to spin_lock_bh(). All places that require spinlocks are in BH context. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index da903b3..418190b 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -1350,15 +1350,13 @@ bnx2_tx_int(struct bnx2 *bp) bp->tx_cons = sw_cons; if (unlikely(netif_queue_stopped(bp->dev))) { - unsigned long flags; - - spin_lock_irqsave(&bp->tx_lock, flags); + spin_lock(&bp->tx_lock); if ((netif_queue_stopped(bp->dev)) && (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) { netif_wake_queue(bp->dev); } - spin_unlock_irqrestore(&bp->tx_lock, flags); + spin_unlock(&bp->tx_lock); } } @@ -1598,11 +1596,9 @@ bnx2_poll(struct net_device *dev, int *budget) (bp->status_blk->status_attn_bits_ack & STATUS_ATTN_BITS_LINK_STATE)) { - unsigned long flags; - - spin_lock_irqsave(&bp->phy_lock, flags); + spin_lock(&bp->phy_lock); bnx2_phy_int(bp); - spin_unlock_irqrestore(&bp->phy_lock, flags); + spin_unlock(&bp->phy_lock); } if (bp->status_blk->status_tx_quick_consumer_index0 != bp->tx_cons) { @@ -1645,9 +1641,8 @@ bnx2_set_rx_mode(struct net_device *dev) struct bnx2 *bp = dev->priv; u32 rx_mode, sort_mode; int i; - unsigned long flags; - spin_lock_irqsave(&bp->phy_lock, flags); + spin_lock_bh(&bp->phy_lock); rx_mode = bp->rx_mode & ~(BNX2_EMAC_RX_MODE_PROMISCUOUS | BNX2_EMAC_RX_MODE_KEEP_VLAN_TAG); @@ -1708,7 +1703,7 @@ bnx2_set_rx_mode(struct net_device *dev) REG_WR(bp, BNX2_RPM_SORT_USER0, sort_mode); REG_WR(bp, BNX2_RPM_SORT_USER0, sort_mode | BNX2_RPM_SORT_USER0_ENA); - spin_unlock_irqrestore(&bp->phy_lock, flags); + spin_unlock_bh(&bp->phy_lock); } static void @@ -3768,10 +3763,10 @@ bnx2_test_link(struct bnx2 *bp) { u32 bmsr; - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); bnx2_read_phy(bp, MII_BMSR, &bmsr); bnx2_read_phy(bp, MII_BMSR, &bmsr); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); if (bmsr & BMSR_LSTATUS) { return 0; @@ -3828,9 +3823,8 @@ bnx2_timer(unsigned long data) if ((bp->phy_flags & PHY_SERDES_FLAG) && (CHIP_NUM(bp) == CHIP_NUM_5706)) { - unsigned long flags; - spin_lock_irqsave(&bp->phy_lock, flags); + spin_lock(&bp->phy_lock); if (bp->serdes_an_pending) { bp->serdes_an_pending--; } @@ -3884,7 +3878,7 @@ bnx2_timer(unsigned long data) else bp->current_interval = bp->timer_interval; - spin_unlock_irqrestore(&bp->phy_lock, flags); + spin_unlock(&bp->phy_lock); } bnx2_restart_timer: @@ -4168,14 +4162,12 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) dev->trans_start = jiffies; if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) { - unsigned long flags; - - spin_lock_irqsave(&bp->tx_lock, flags); + spin_lock(&bp->tx_lock); netif_stop_queue(dev); if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS) netif_wake_queue(dev); - spin_unlock_irqrestore(&bp->tx_lock, flags); + spin_unlock(&bp->tx_lock); } return NETDEV_TX_OK; @@ -4411,11 +4403,11 @@ bnx2_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) bp->req_line_speed = req_line_speed; bp->req_duplex = req_duplex; - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); bnx2_setup_phy(bp); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); return 0; } @@ -4485,16 +4477,16 @@ bnx2_nway_reset(struct net_device *dev) return -EINVAL; } - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); /* Force a link down visible on the other side */ if (bp->phy_flags & PHY_SERDES_FLAG) { bnx2_write_phy(bp, MII_BMCR, BMCR_LOOPBACK); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); msleep(20); - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); if (CHIP_NUM(bp) == CHIP_NUM_5706) { bp->current_interval = SERDES_AN_TIMEOUT; bp->serdes_an_pending = 1; @@ -4506,7 +4498,7 @@ bnx2_nway_reset(struct net_device *dev) bmcr &= ~BMCR_LOOPBACK; bnx2_write_phy(bp, MII_BMCR, bmcr | BMCR_ANRESTART | BMCR_ANENABLE); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); return 0; } @@ -4692,11 +4684,11 @@ bnx2_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *epause) bp->autoneg &= ~AUTONEG_FLOW_CTRL; } - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); bnx2_setup_phy(bp); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); return 0; } @@ -5046,9 +5038,9 @@ bnx2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCGMIIREG: { u32 mii_regval; - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); err = bnx2_read_phy(bp, data->reg_num & 0x1f, &mii_regval); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); data->val_out = mii_regval; @@ -5059,9 +5051,9 @@ bnx2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!capable(CAP_NET_ADMIN)) return -EPERM; - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); err = bnx2_write_phy(bp, data->reg_num & 0x1f, data->val_in); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); return err; -- cgit v0.10.2 From 73eef4cddb2738c4e8c5ef157ebb1b19d6c9272f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 25 Aug 2005 15:39:15 -0700 Subject: [BNX2]: update version and minor fixes Update version and add 4 minor fixes, the last 2 were suggested by Jeff Garzik: 1. check for a valid ethernet address before setting it 2. zero out bp->regview if init_one encounters an error and unmaps the IO address. This prevents remove_one from unmapping again. 3. use netif_rx_schedule() instead of hand coding the same. 4. use IRQ_HANDLED and IRQ_NONE. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 418190b..7babf6a 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -14,8 +14,8 @@ #define DRV_MODULE_NAME "bnx2" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "1.2.19" -#define DRV_MODULE_RELDATE "May 23, 2005" +#define DRV_MODULE_VERSION "1.2.20" +#define DRV_MODULE_RELDATE "August 22, 2005" #define RUN_AT(x) (jiffies + (x)) @@ -1538,15 +1538,12 @@ bnx2_msi(int irq, void *dev_instance, struct pt_regs *regs) BNX2_PCICFG_INT_ACK_CMD_MASK_INT); /* Return here if interrupt is disabled. */ - if (unlikely(atomic_read(&bp->intr_sem) != 0)) { - return IRQ_RETVAL(1); - } + if (unlikely(atomic_read(&bp->intr_sem) != 0)) + return IRQ_HANDLED; - if (netif_rx_schedule_prep(dev)) { - __netif_rx_schedule(dev); - } + netif_rx_schedule(dev); - return IRQ_RETVAL(1); + return IRQ_HANDLED; } static irqreturn_t @@ -1564,22 +1561,19 @@ bnx2_interrupt(int irq, void *dev_instance, struct pt_regs *regs) if ((bp->status_blk->status_idx == bp->last_status_idx) || (REG_RD(bp, BNX2_PCICFG_MISC_STATUS) & BNX2_PCICFG_MISC_STATUS_INTA_VALUE)) - return IRQ_RETVAL(0); + return IRQ_NONE; REG_WR(bp, BNX2_PCICFG_INT_ACK_CMD, BNX2_PCICFG_INT_ACK_CMD_USE_INT_HC_PARAM | BNX2_PCICFG_INT_ACK_CMD_MASK_INT); /* Return here if interrupt is shared and is disabled. */ - if (unlikely(atomic_read(&bp->intr_sem) != 0)) { - return IRQ_RETVAL(1); - } + if (unlikely(atomic_read(&bp->intr_sem) != 0)) + return IRQ_HANDLED; - if (netif_rx_schedule_prep(dev)) { - __netif_rx_schedule(dev); - } + netif_rx_schedule(dev); - return IRQ_RETVAL(1); + return IRQ_HANDLED; } static int @@ -5071,6 +5065,9 @@ bnx2_change_mac_addr(struct net_device *dev, void *p) struct sockaddr *addr = p; struct bnx2 *bp = dev->priv; + if (!is_valid_ether_addr(addr->sa_data)) + return -EINVAL; + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); if (netif_running(dev)) bnx2_set_mac_addr(bp); @@ -5369,6 +5366,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) err_out_unmap: if (bp->regview) { iounmap(bp->regview); + bp->regview = NULL; } err_out_release: -- cgit v0.10.2 From 57bf1451ac79640c5a0a4f31284c43539fac2903 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 25 Aug 2005 16:06:19 -0700 Subject: [NET]: net/802: more endian annotations The rest of endian warnings now belongs to tr.c exclusively. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller diff --git a/include/linux/hippidevice.h b/include/linux/hippidevice.h index 9bc3b68..bab303d 100644 --- a/include/linux/hippidevice.h +++ b/include/linux/hippidevice.h @@ -31,8 +31,7 @@ struct hippi_cb { __u32 ifield; }; -extern unsigned short hippi_type_trans(struct sk_buff *skb, - struct net_device *dev); +extern __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev); extern struct net_device *alloc_hippi_dev(int sizeof_priv); #endif diff --git a/include/linux/if_fc.h b/include/linux/if_fc.h index 33330b4..376a34e 100644 --- a/include/linux/if_fc.h +++ b/include/linux/if_fc.h @@ -44,7 +44,7 @@ struct fcllc { __u8 ssap; /* source SAP */ __u8 llc; /* LLC control field */ __u8 protid[3]; /* protocol id */ - __u16 ethertype; /* ether type field */ + __be16 ethertype; /* ether type field */ }; #endif /* _LINUX_IF_FC_H */ diff --git a/include/linux/if_fddi.h b/include/linux/if_fddi.h index a912818..1288a16 100644 --- a/include/linux/if_fddi.h +++ b/include/linux/if_fddi.h @@ -85,7 +85,7 @@ struct fddi_snap_hdr __u8 ssap; /* always 0xAA */ __u8 ctrl; /* always 0x03 */ __u8 oui[FDDI_K_OUI_LEN]; /* organizational universal id */ - __u16 ethertype; /* packet type ID field */ + __be16 ethertype; /* packet type ID field */ } __attribute__ ((packed)); /* Define FDDI LLC frame header */ diff --git a/include/linux/if_hippi.h b/include/linux/if_hippi.h index c8ca72c..94d31ca 100644 --- a/include/linux/if_hippi.h +++ b/include/linux/if_hippi.h @@ -102,9 +102,9 @@ struct hippi_fp_hdr #error "Please fix " #endif #else - __u32 fixed; + __be32 fixed; #endif - __u32 d2_size; + __be32 d2_size; } __attribute__ ((packed)); struct hippi_le_hdr @@ -144,7 +144,7 @@ struct hippi_snap_hdr __u8 ssap; /* always 0xAA */ __u8 ctrl; /* always 0x03 */ __u8 oui[HIPPI_OUI_LEN]; /* organizational universal id (zero)*/ - __u16 ethertype; /* packet type ID field */ + __be16 ethertype; /* packet type ID field */ } __attribute__ ((packed)); struct hippi_hdr diff --git a/net/802/fc.c b/net/802/fc.c index 640d34e..282c4ab 100644 --- a/net/802/fc.c +++ b/net/802/fc.c @@ -87,7 +87,7 @@ static int fc_rebuild_header(struct sk_buff *skb) struct fch_hdr *fch=(struct fch_hdr *)skb->data; struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr)); if(fcllc->ethertype != htons(ETH_P_IP)) { - printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n",(unsigned int)htons(fcllc->ethertype)); + printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype)); return 0; } #ifdef CONFIG_INET diff --git a/net/802/fddi.c b/net/802/fddi.c index 5ce24c4..ac242a4b 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -108,8 +108,8 @@ static int fddi_rebuild_header(struct sk_buff *skb) else #endif { - printk("%s: Don't know how to resolve type %02X addresses.\n", - skb->dev->name, htons(fddi->hdr.llc_snap.ethertype)); + printk("%s: Don't know how to resolve type %04X addresses.\n", + skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype)); return(0); } } diff --git a/net/802/hippi.c b/net/802/hippi.c index cb45ae1..6d7fed3 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -124,7 +124,7 @@ static int hippi_rebuild_header(struct sk_buff *skb) * Determine the packet's protocol ID. */ -unsigned short hippi_type_trans(struct sk_buff *skb, struct net_device *dev) +__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev) { struct hippi_hdr *hip; -- cgit v0.10.2 From cf4ef01440ca5c6d96f2ea2b793a37a0a863a045 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Aug 2005 16:08:37 -0700 Subject: [LIST]: Add docbook header comments for hlist_add_{before,after}_rcu() Signed-off-by: David S. Miller diff --git a/include/linux/list.h b/include/linux/list.h index 9b9b0ee..e6ec596 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -634,6 +634,21 @@ static inline void hlist_add_after(struct hlist_node *n, next->next->pprev = &next->next; } +/** + * hlist_add_before_rcu - adds the specified element to the specified hlist + * before the specified node while permitting racing traversals. + * @n: the new element to add to the hash list. + * @next: the existing element to add the new element before. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. + */ static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { @@ -644,6 +659,21 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, *(n->pprev) = n; } +/** + * hlist_add_after_rcu - adds the specified element to the specified hlist + * after the specified node while permitting racing traversals. + * @prev: the existing element to add the new element after. + * @n: the new element to add to the hash list. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. + */ static inline void hlist_add_after_rcu(struct hlist_node *prev, struct hlist_node *n) { -- cgit v0.10.2 From 8082e4ed0a61da347f1c7f210493c4e9e55c8cd0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 25 Aug 2005 16:12:22 -0700 Subject: [LIB]: Boyer-Moore extension for textsearch infrastructure strike #2 Attached the implementation of the Boyer-Moore string search algorithm for the new textsearch infrastructure. I've added as well a note about the limitations that this approach presents, as Thomas has remarked. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller diff --git a/lib/Kconfig b/lib/Kconfig index eeb429a..b62f685 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -72,6 +72,16 @@ config TEXTSEARCH config TEXTSEARCH_KMP tristate +config TEXTSEARCH_BM + depends on TEXTSEARCH + tristate "Boyer-Moore" + help + Say Y here if you want to be able to search text using the + Boyer-Moore textsearch algorithm. + + To compile this code as a module, choose M here: the + module will be called ts_bm. + config TEXTSEARCH_FSM tristate diff --git a/lib/Makefile b/lib/Makefile index f28d903..52f8338 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_TEXTSEARCH) += textsearch.o obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o +obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o hostprogs-y := gen_crc32table diff --git a/lib/ts_bm.c b/lib/ts_bm.c new file mode 100644 index 0000000..2cc79112 --- /dev/null +++ b/lib/ts_bm.c @@ -0,0 +1,185 @@ +/* + * lib/ts_bm.c Boyer-Moore text search implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Pablo Neira Ayuso + * + * ========================================================================== + * + * Implements Boyer-Moore string matching algorithm: + * + * [1] A Fast String Searching Algorithm, R.S. Boyer and Moore. + * Communications of the Association for Computing Machinery, + * 20(10), 1977, pp. 762-772. + * http://www.cs.utexas.edu/users/moore/publications/fstrpos.pdf + * + * [2] Handbook of Exact String Matching Algorithms, Thierry Lecroq, 2004 + * http://www-igm.univ-mlv.fr/~lecroq/string/string.pdf + * + * Note: Since Boyer-Moore (BM) performs searches for matchings from right + * to left, it's still possible that a matching could be spread over + * multiple blocks, in that case this algorithm won't find any coincidence. + * + * If you're willing to ensure that such thing won't ever happen, use the + * Knuth-Pratt-Morris (KMP) implementation instead. In conclusion, choose + * the proper string search algorithm depending on your setting. + * + * Say you're using the textsearch infrastructure for filtering, NIDS or + * any similar security focused purpose, then go KMP. Otherwise, if you + * really care about performance, say you're classifying packets to apply + * Quality of Service (QoS) policies, and you don't mind about possible + * matchings spread over multiple fragments, then go BM. + */ + +#include +#include +#include +#include +#include +#include + +/* Alphabet size, use ASCII */ +#define ASIZE 256 + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(args, format...) +#endif + +struct ts_bm +{ + u8 * pattern; + unsigned int patlen; + unsigned int bad_shift[ASIZE]; + unsigned int good_shift[0]; +}; + +static unsigned int bm_find(struct ts_config *conf, struct ts_state *state) +{ + struct ts_bm *bm = ts_config_priv(conf); + unsigned int i, text_len, consumed = state->offset; + const u8 *text; + int shift = bm->patlen, bs; + + for (;;) { + text_len = conf->get_next_block(consumed, &text, conf, state); + + if (unlikely(text_len == 0)) + break; + + while (shift < text_len) { + DEBUGP("Searching in position %d (%c)\n", + shift, text[shift]); + for (i = 0; i < bm->patlen; i++) + if (text[shift-i] != bm->pattern[bm->patlen-1-i]) + goto next; + + /* London calling... */ + DEBUGP("found!\n"); + return consumed += (shift-(bm->patlen-1)); + +next: bs = bm->bad_shift[text[shift-i]]; + + /* Now jumping to... */ + shift = max_t(int, shift-i+bs, shift+bm->good_shift[i]); + } + consumed += text_len; + } + + return UINT_MAX; +} + +static void compute_prefix_tbl(struct ts_bm *bm, const u8 *pattern, + unsigned int len) +{ + int i, j, ended, l[ASIZE]; + + for (i = 0; i < ASIZE; i++) + bm->bad_shift[i] = len; + for (i = 0; i < len - 1; i++) + bm->bad_shift[pattern[i]] = len - 1 - i; + + /* Compute the good shift array, used to match reocurrences + * of a subpattern */ + for (i = 1; i < bm->patlen; i++) { + for (j = 0; j < bm->patlen && bm->pattern[bm->patlen - 1 - j] + == bm->pattern[bm->patlen - 1 - i - j]; j++); + l[i] = j; + } + + bm->good_shift[0] = 1; + for (i = 1; i < bm->patlen; i++) + bm->good_shift[i] = bm->patlen; + for (i = bm->patlen - 1; i > 0; i--) + bm->good_shift[l[i]] = i; + ended = 0; + for (i = 0; i < bm->patlen; i++) { + if (l[i] == bm->patlen - 1 - i) + ended = i; + if (ended) + bm->good_shift[i] = ended; + } +} + +static struct ts_config *bm_init(const void *pattern, unsigned int len, + int gfp_mask) +{ + struct ts_config *conf; + struct ts_bm *bm; + unsigned int prefix_tbl_len = len * sizeof(unsigned int); + size_t priv_size = sizeof(*bm) + len + prefix_tbl_len; + + conf = alloc_ts_config(priv_size, gfp_mask); + if (IS_ERR(conf)) + return conf; + + bm = ts_config_priv(conf); + bm->patlen = len; + bm->pattern = (u8 *) bm->good_shift + prefix_tbl_len; + compute_prefix_tbl(bm, pattern, len); + memcpy(bm->pattern, pattern, len); + + return conf; +} + +static void *bm_get_pattern(struct ts_config *conf) +{ + struct ts_bm *bm = ts_config_priv(conf); + return bm->pattern; +} + +static unsigned int bm_get_pattern_len(struct ts_config *conf) +{ + struct ts_bm *bm = ts_config_priv(conf); + return bm->patlen; +} + +static struct ts_ops bm_ops = { + .name = "bm", + .find = bm_find, + .init = bm_init, + .get_pattern = bm_get_pattern, + .get_pattern_len = bm_get_pattern_len, + .owner = THIS_MODULE, + .list = LIST_HEAD_INIT(bm_ops.list) +}; + +static int __init init_bm(void) +{ + return textsearch_register(&bm_ops); +} + +static void __exit exit_bm(void) +{ + textsearch_unregister(&bm_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_bm); +module_exit(exit_bm); -- cgit v0.10.2 From 29cb9f9c5502f6218cd3ea574efe46a5e55522d2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 25 Aug 2005 16:23:11 -0700 Subject: [LIB]: Make TEXTSEARCH_BM plain tristate like the others And select it when the relevant modules are enabled. Signed-off-by: David S. Miller diff --git a/lib/Kconfig b/lib/Kconfig index b62f685..e43197e 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -73,14 +73,7 @@ config TEXTSEARCH_KMP tristate config TEXTSEARCH_BM - depends on TEXTSEARCH - tristate "Boyer-Moore" - help - Say Y here if you want to be able to search text using the - Boyer-Moore textsearch algorithm. - - To compile this code as a module, choose M here: the - module will be called ts_bm. + tristate config TEXTSEARCH_FSM tristate diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index f2bea6e..c4213f3 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -415,6 +415,7 @@ config IP_NF_MATCH_STRING depends on IP_NF_IPTABLES select TEXTSEARCH select TEXTSEARCH_KMP + select TEXTSEARCH_BM select TEXTSEARCH_FSM help This option adds a `string' match, which allows you to look for diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 59d3e71..45d3bc0 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -491,6 +491,7 @@ config NET_EMATCH_TEXT depends on NET_EMATCH select TEXTSEARCH select TEXTSEARCH_KMP + select TEXTSEARCH_BM select TEXTSEARCH_FSM ---help--- Say Y here if you want to be ablt to classify packets based on -- cgit v0.10.2 From ba89966c1984513f4f2cc0a6c182266be44ddd03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 26 Aug 2005 12:05:31 -0700 Subject: [NET]: use __read_mostly on kmem_cache_t , DEFINE_SNMP_STAT pointers This patch puts mostly read only data in the right section (read_mostly), to help sharing of these data between CPUS without memory ping pongs. On one of my production machine, tcp_statistics was sitting in a heavily modified cache line, so *every* SNMP update had to force a reload. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e6c2200..24396b9 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -23,7 +23,7 @@ #include #include "br_private.h" -static kmem_cache_t *br_fdb_cache; +static kmem_cache_t *br_fdb_cache __read_mostly; static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr); diff --git a/net/core/flow.c b/net/core/flow.c index f289570..7e95b39 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -42,7 +42,7 @@ static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; #define flow_table(cpu) (per_cpu(flow_tables, cpu)) -static kmem_cache_t *flow_cachep; +static kmem_cache_t *flow_cachep __read_mostly; static int flow_lwm, flow_hwm; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b853a9b..f80a287 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -68,8 +68,8 @@ #include #include -static kmem_cache_t *skbuff_head_cache; -static kmem_cache_t *skbuff_fclone_cache; +static kmem_cache_t *skbuff_head_cache __read_mostly; +static kmem_cache_t *skbuff_fclone_cache __read_mostly; struct timeval __read_mostly skb_tv_base; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index fe4cc85..cf93b01 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -85,7 +85,7 @@ static int ccid3_debug; static struct dccp_tx_hist *ccid3_tx_hist; static struct dccp_rx_hist *ccid3_rx_hist; -static kmem_cache_t *ccid3_loss_interval_hist_slab; +static kmem_cache_t *ccid3_loss_interval_hist_slab __read_mostly; static inline struct ccid3_loss_interval_hist_entry * ccid3_loss_interval_hist_entry_new(const unsigned int __nocast prio) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 600dda5..f97e92e 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -39,7 +39,7 @@ #include "ccid.h" #include "dccp.h" -DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics); +DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; atomic_t dccp_orphan_count = ATOMIC_INIT(0); diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 73a8848..eeba56f 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -79,7 +79,7 @@ for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_n static DEFINE_RWLOCK(dn_fib_tables_lock); struct dn_fib_table *dn_fib_tables[RT_TABLE_MAX + 1]; -static kmem_cache_t *dn_hash_kmem; +static kmem_cache_t *dn_hash_kmem __read_mostly; static int dn_fib_hash_zombies; static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5810f9d..bf147f8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -113,7 +113,7 @@ #include #endif -DEFINE_SNMP_STAT(struct linux_mib, net_statistics); +DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; extern void ip_mc_drop_socket(struct sock *sk); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index b10d6bb..2a8c9af 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -45,8 +45,8 @@ #include "fib_lookup.h" -static kmem_cache_t *fn_hash_kmem; -static kmem_cache_t *fn_alias_kmem; +static kmem_cache_t *fn_hash_kmem __read_mostly; +static kmem_cache_t *fn_alias_kmem __read_mostly; struct fib_node { struct hlist_node fn_hash; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ff21748..b2dea4e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -166,7 +166,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); -static kmem_cache_t *fn_alias_kmem; +static kmem_cache_t *fn_alias_kmem __read_mostly; static struct trie *trie_local = NULL, *trie_main = NULL; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 25f66b7..24eb56a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -114,7 +114,7 @@ struct icmp_bxm { /* * Statistics */ -DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); +DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly; /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 4410b9d..f84ba9c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -73,7 +73,7 @@ /* Exported for inet_getid inline function. */ DEFINE_SPINLOCK(inet_peer_idlock); -static kmem_cache_t *peer_cachep; +static kmem_cache_t *peer_cachep __read_mostly; #define node_height(x) x->avl_height static struct inet_peer peer_fake_node = { diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 220a8b5..473d0f2 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -150,7 +150,7 @@ * SNMP management statistics */ -DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); +DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly; /* * Process Router Attention IP option diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index dc806b5..9dbf590 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock); In this case data path is free of exclusive locks at all. */ -static kmem_cache_t *mrt_cachep; +static kmem_cache_t *mrt_cachep __read_mostly; static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index d0145a8..e11952e 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -40,7 +40,7 @@ static struct list_head *ip_vs_conn_tab; /* SLAB cache for IPVS connections */ -static kmem_cache_t *ip_vs_conn_cachep; +static kmem_cache_t *ip_vs_conn_cachep __read_mostly; /* counter for current IPVS connections */ static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 285743b..a064860 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -70,8 +70,8 @@ static LIST_HEAD(helpers); unsigned int ip_conntrack_htable_size = 0; int ip_conntrack_max; struct list_head *ip_conntrack_hash; -static kmem_cache_t *ip_conntrack_cachep; -static kmem_cache_t *ip_conntrack_expect_cachep; +static kmem_cache_t *ip_conntrack_cachep __read_mostly; +static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly; struct ip_conntrack ip_conntrack_untracked; unsigned int ip_ct_log_invalid; static LIST_HEAD(unconfirmed); diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 564b49b..2dd1ccc 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -94,7 +94,7 @@ struct ipt_hashlimit_htable { static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ static HLIST_HEAD(hashlimit_htables); -static kmem_cache_t *hashlimit_cachep; +static kmem_cache_t *hashlimit_cachep __read_mostly; static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 68626de..02fdda6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -269,7 +269,7 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; -DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); +DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly; atomic_t tcp_orphan_count = ATOMIC_INIT(0); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3a5bbbe..e5beca7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -113,7 +113,7 @@ * Snmp MIB for the UDP layer */ -DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); +DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ff685f2..5176fc6 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -67,7 +67,7 @@ #include #include -DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); +DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly; /* * The ICMP socket(s). This is the most convenient way to flow control diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1b354aa..16af874 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -49,7 +49,7 @@ struct rt6_statistics rt6_stats; -static kmem_cache_t * fib6_node_kmem; +static kmem_cache_t * fib6_node_kmem __read_mostly; enum fib_walk_state_t { diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 7516b88..76466af 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -55,7 +55,7 @@ #include -DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); +DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly; static struct packet_type ipv6_packet_type = { .type = __constant_htons(ETH_P_IPV6), diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 67d9a04..390d750 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -59,7 +59,7 @@ #include #include -DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6); +DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; /* Grrr, addr_type already calculated by caller, but I don't want * to add some silly "cookie" argument to this method just for that. diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 60c26c8..fbef782 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -79,7 +79,7 @@ static u32 xfrm6_tunnel_spi; #define XFRM6_TUNNEL_SPI_MIN 1 #define XFRM6_TUNNEL_SPI_MAX 0xffffffff -static kmem_cache_t *xfrm6_tunnel_spi_kmem; +static kmem_cache_t *xfrm6_tunnel_spi_kmem __read_mostly; #define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 #define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 7d8ec65..e7025be 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -62,7 +62,7 @@ /* Global data structures. */ struct sctp_globals sctp_globals; struct proc_dir_entry *proc_net_sctp; -DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics); +DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics) __read_mostly; struct idr sctp_assocs_id; DEFINE_SPINLOCK(sctp_assocs_id_lock); @@ -78,8 +78,8 @@ static struct sctp_pf *sctp_pf_inet_specific; static struct sctp_af *sctp_af_v4_specific; static struct sctp_af *sctp_af_v6_specific; -kmem_cache_t *sctp_chunk_cachep; -kmem_cache_t *sctp_bucket_cachep; +kmem_cache_t *sctp_chunk_cachep __read_mostly; +kmem_cache_t *sctp_bucket_cachep __read_mostly; extern int sctp_snmp_proc_init(void); extern int sctp_snmp_proc_exit(void); diff --git a/net/socket.c b/net/socket.c index ce69b78..94fe638 100644 --- a/net/socket.c +++ b/net/socket.c @@ -274,7 +274,7 @@ int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ule #define SOCKFS_MAGIC 0x534F434B -static kmem_cache_t * sock_inode_cachep; +static kmem_cache_t * sock_inode_cachep __read_mostly; static struct inode *sock_alloc_inode(struct super_block *sb) { @@ -333,7 +333,7 @@ static struct super_block *sockfs_get_sb(struct file_system_type *fs_type, return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC); } -static struct vfsmount *sock_mnt; +static struct vfsmount *sock_mnt __read_mostly; static struct file_system_type sock_fs_type = { .name = "sockfs", diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 554f224..fe1a73c 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -28,13 +28,13 @@ #include #include -static struct vfsmount *rpc_mount; +static struct vfsmount *rpc_mount __read_mostly; static int rpc_mount_count; static struct file_system_type rpc_pipe_fs_type; -static kmem_cache_t *rpc_inode_cachep; +static kmem_cache_t *rpc_inode_cachep __read_mostly; #define RPC_UPCALL_TIMEOUT (30*HZ) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 2d9eb7f..f310403 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -34,10 +34,10 @@ static int rpc_task_id; #define RPC_BUFFER_MAXSIZE (2048) #define RPC_BUFFER_POOLSIZE (8) #define RPC_TASK_POOLSIZE (8) -static kmem_cache_t *rpc_task_slabp; -static kmem_cache_t *rpc_buffer_slabp; -static mempool_t *rpc_task_mempool; -static mempool_t *rpc_buffer_mempool; +static kmem_cache_t *rpc_task_slabp __read_mostly; +static kmem_cache_t *rpc_buffer_slabp __read_mostly; +static mempool_t *rpc_task_mempool __read_mostly; +static mempool_t *rpc_buffer_mempool __read_mostly; static void __rpc_default_timer(struct rpc_task *task); static void rpciod_killall(void); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c58a6f0..2407a70 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -12,7 +12,7 @@ #include #include -static kmem_cache_t *secpath_cachep; +static kmem_cache_t *secpath_cachep __read_mostly; void __secpath_destroy(struct sec_path *sp) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d65ed86..83c8135 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -37,7 +37,7 @@ EXPORT_SYMBOL(xfrm_policy_list); static DEFINE_RWLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO]; -static kmem_cache_t *xfrm_dst_cache; +static kmem_cache_t *xfrm_dst_cache __read_mostly; static struct work_struct xfrm_policy_gc_work; static struct list_head xfrm_policy_gc_list = -- cgit v0.10.2 From 75b3f207b433dcb807fcf0f47de1c8398571ba5f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 27 Aug 2005 02:35:30 -0300 Subject: [DCCP]: Make the Debug Menu available when DCCP is statically linked too Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 3023f70..187ac18 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -27,7 +27,7 @@ config INET_DCCP_DIAG source "net/dccp/ccids/Kconfig" menu "DCCP Kernel Hacking" - depends on IP_DCCP=m && DEBUG_KERNEL=y + depends on IP_DCCP && DEBUG_KERNEL=y config IP_DCCP_DEBUG bool "DCCP debug messages" @@ -37,7 +37,7 @@ config IP_DCCP_DEBUG Just say N. config IP_DCCP_UNLOAD_HACK - depends on IP_DCCP_CCID3=m + depends on IP_DCCP=m && IP_DCCP_CCID3=m bool "DCCP control sock unload hack" ---help--- Enable this to be able to unload the dccp module when the it -- cgit v0.10.2 From d6809c12b3334a929c39bf08ea63bd819e0500f7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 27 Aug 2005 03:06:35 -0300 Subject: [DCCP]: Introduce dccp_wait_for_ccid and use it in dccp_write_xmit This is not quite what I think we should have long term but improves performance for now, so lets use it till we get CCID3 working well, then we can think about using sk_write_queue, perhaps using some ideas from Juwen Lai's old stack for 2.4.20. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index cf93b01..9866dc1 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -985,7 +985,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, ccid3_pr_debug("send_packet delay=%ld\n", delay); delay /= -1000; /* divide by -1000 is to convert to ms and get sign right */ - rc = delay > 0 ? -EAGAIN : 0; + rc = delay > 0 ? delay : 0; break; default: printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index c6ba07e..6ba2150 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -126,8 +126,7 @@ extern void dccp_send_delayed_ack(struct sock *sk); extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); -extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, - const int len); +extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo); extern void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) diff --git a/net/dccp/output.c b/net/dccp/output.c index f96dedd..116f6db 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -150,14 +150,71 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) return mss_now; } -int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, const int len) +/** + * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet + * @sk: socket to wait for + * @timeo: for how long + */ +static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, + long *timeo) +{ + struct dccp_sock *dp = dccp_sk(sk); + DEFINE_WAIT(wait); + long delay; + int rc; + + while (1) { + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + if (!*timeo) + goto do_nonblock; + if (signal_pending(current)) + goto do_interrupted; + + rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb, + skb->len); + if (rc <= 0) + break; + delay = msecs_to_jiffies(rc); + if (delay > *timeo || delay < 0) + goto do_nonblock; + + sk->sk_write_pending++; + release_sock(sk); + *timeo -= schedule_timeout(delay); + lock_sock(sk); + sk->sk_write_pending--; + } +out: + finish_wait(sk->sk_sleep, &wait); + return rc; + +do_error: + rc = -EPIPE; + goto out; +do_nonblock: + rc = -EAGAIN; + goto out; +do_interrupted: + rc = sock_intr_errno(*timeo); + goto out; +} + +int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo) { const struct dccp_sock *dp = dccp_sk(sk); - int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb, len); + int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb, + skb->len); + + if (err > 0) + err = dccp_wait_for_ccid(sk, skb, timeo); if (err == 0) { const struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + const int len = skb->len; if (sk->sk_state == DCCP_PARTOPEN) { /* See 8.1.5. Handshake Completion */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index f97e92e..f4da656 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -261,7 +261,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (rc != 0) goto out_discard; - rc = dccp_write_xmit(sk, skb, len); + rc = dccp_write_xmit(sk, skb, &timeo); /* * XXX we don't use sk_write_queue, so just discard the packet. * Current plan however is to _use_ sk_write_queue with -- cgit v0.10.2 From 1f2333aea3269e196c44ae9a220e714cc1427792 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 27 Aug 2005 03:51:58 -0300 Subject: [CCID3]: Reflow to mostly fit under 80 columns No code changes. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 9866dc1..225c530 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -142,14 +142,16 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) } #endif -static inline void ccid3_hc_tx_set_state(struct sock *sk, enum ccid3_hc_tx_states state) +static inline void ccid3_hc_tx_set_state(struct sock *sk, + enum ccid3_hc_tx_states state) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_tx_state_name(oldstate), ccid3_tx_state_name(state)); + dccp_role(sk), sk, ccid3_tx_state_name(oldstate), + ccid3_tx_state_name(state)); WARN_ON(state == oldstate); hctx->ccid3hctx_state = state; } @@ -785,7 +787,8 @@ static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx) /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx) { - hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); + hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, + TFRC_OPSYS_HALF_TIME_GRAN); } @@ -804,20 +807,25 @@ static void ccid3_hc_tx_update_x(struct sock *sk) struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; - if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) { /* to avoid large error in calcX */ + /* To avoid large error in calcX */ + if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) { hctx->ccid3hctx_x_calc = ccid3_calc_x(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, hctx->ccid3hctx_p); - hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc, 2 * hctx->ccid3hctx_x_recv), - hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME); + hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc, + 2 * hctx->ccid3hctx_x_recv), + (hctx->ccid3hctx_s / + TFRC_MAX_BACK_OFF_TIME)); } else if (now_delta(hctx->ccid3hctx_t_ld) >= hctx->ccid3hctx_rtt) { u32 rtt = hctx->ccid3hctx_rtt; if (rtt < 10) { rtt = 10; } /* avoid divide by zero below */ - hctx->ccid3hctx_x = max_t(u32, min_t(u32, 2 * hctx->ccid3hctx_x_recv, 2 * hctx->ccid3hctx_x), - (hctx->ccid3hctx_s * 100000) / (rtt / 10)); + hctx->ccid3hctx_x = max_t(u32, min_t(u32, 2 * hctx->ccid3hctx_x_recv, + 2 * hctx->ccid3hctx_x), + ((hctx->ccid3hctx_s * 100000) / + (rtt / 10))); /* Using 100000 and 10 to avoid 32 bit overflow for jumbo frames */ do_gettimeofday(&hctx->ccid3hctx_t_ld); } @@ -840,7 +848,8 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) if (sock_owned_by_user(sk)) { /* Try again later. */ /* XXX: set some sensible MIB */ - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + HZ / 5); + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + HZ / 5); goto out; } @@ -858,27 +867,38 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) case TFRC_SSTATE_NO_FBACK: /* Halve send rate */ hctx->ccid3hctx_x /= 2; - if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME)) - hctx->ccid3hctx_x = hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME; + if (hctx->ccid3hctx_x < + (hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME)) + hctx->ccid3hctx_x = (hctx->ccid3hctx_s / + TFRC_MAX_BACK_OFF_TIME); - ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d bytes/s\n", - dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state), + ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d " + "bytes/s\n", + dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state), hctx->ccid3hctx_x); - next_tmout = max_t(u32, 2 * (hctx->ccid3hctx_s * 100000) - / (hctx->ccid3hctx_x / 10), TFRC_INITIAL_TIMEOUT); + next_tmout = max_t(u32, 2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x / 10), + TFRC_INITIAL_TIMEOUT); /* do above maths with 100000 and 10 to prevent overflow on 32 bit */ - /* FIXME - not sure above calculation is correct. See section 5 of CCID3 11 - * should adjust tx_t_ipi and double that to achieve it really */ + /* + * FIXME - not sure above calculation is correct. See section + * 5 of CCID3 11 should adjust tx_t_ipi and double that to + * achieve it really + */ break; case TFRC_SSTATE_FBACK: - /* Check if IDLE since last timeout and recv rate is less than 4 packets per RTT */ + /* + * Check if IDLE since last timeout and recv rate is less than + * 4 packets per RTT + */ rtt = hctx->ccid3hctx_rtt; if (rtt < 10) rtt = 10; /* stop divide by zero below */ - if (!hctx->ccid3hctx_idle || (hctx->ccid3hctx_x_recv >= - 4 * (hctx->ccid3hctx_s * 100000) / (rtt / 10))) { - ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n", dccp_role(sk), sk, + if (!hctx->ccid3hctx_idle || + (hctx->ccid3hctx_x_recv >= 4 * (hctx->ccid3hctx_s * 100000) / (rtt / 10))) { + ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n", + dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state)); /* Halve sending rate */ @@ -887,7 +907,8 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * Else * X_recv = X_calc / 4; */ - BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && hctx->ccid3hctx_x_calc == 0); + BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && + hctx->ccid3hctx_x_calc == 0); /* check also if p is zero -> x_calc is infinity? */ if (hctx->ccid3hctx_p < TFRC_SMALLEST_P || @@ -916,7 +937,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) } sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, - jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); + jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); hctx->ccid3hctx_idle = 1; out: bh_unlock_sock(sk); @@ -933,24 +954,27 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, long delay; int rc = -ENOTCONN; -// ccid3_pr_debug("%s, sk=%p, skb=%p, len=%d\n", dccp_role(sk), sk, skb, len); + /* Check if pure ACK or Terminating*/ + /* - * check if pure ACK or Terminating */ - /* XXX: We only call this function for DATA and DATAACK, on, these packets can have - * zero length, but why the comment about "pure ACK"? + * XXX: We only call this function for DATA and DATAACK, on, these + * packets can have zero length, but why the comment about "pure ACK"? */ - if (hctx == NULL || len == 0 || hctx->ccid3hctx_state == TFRC_SSTATE_TERM) + if (hctx == NULL || len == 0 || + hctx->ccid3hctx_state == TFRC_SSTATE_TERM) goto out; /* See if last packet allocated was not sent */ new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist); if (new_packet == NULL || new_packet->dccphtx_sent) { - new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist, SLAB_ATOMIC); + new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist, + SLAB_ATOMIC); rc = -ENOBUFS; if (new_packet == NULL) { ccid3_pr_debug("%s, sk=%p, not enough mem to add " - "to history, send refused\n", dccp_role(sk), sk); + "to history, send refused\n", + dccp_role(sk), sk); goto out; } @@ -961,12 +985,13 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, switch (hctx->ccid3hctx_state) { case TFRC_SSTATE_NO_SENT: - ccid3_pr_debug("%s, sk=%p, first packet(%llu)\n", dccp_role(sk), sk, - dp->dccps_gss); + ccid3_pr_debug("%s, sk=%p, first packet(%llu)\n", + dccp_role(sk), sk, dp->dccps_gss); hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer; hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk; - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)); + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)); hctx->ccid3hctx_last_win_count = 0; hctx->ccid3hctx_t_last_win_count = now; ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); @@ -981,7 +1006,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, break; case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: - delay = (now_delta(hctx->ccid3hctx_t_nom) - hctx->ccid3hctx_delta); + delay = now_delta(hctx->ccid3hctx_t_nom) - hctx->ccid3hctx_delta; ccid3_pr_debug("send_packet delay=%ld\n", delay); delay /= -1000; /* divide by -1000 is to convert to ms and get sign right */ @@ -1027,41 +1052,35 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist); if (packet == NULL) { - printk(KERN_CRIT "%s: packet doesn't exists in history!\n", __FUNCTION__); + printk(KERN_CRIT "%s: packet doesn't exists in " + "history!\n", __FUNCTION__); return; } if (packet->dccphtx_sent) { - printk(KERN_CRIT "%s: no unsent packet in history!\n", __FUNCTION__); + printk(KERN_CRIT "%s: no unsent packet in history!\n", + __FUNCTION__); return; } packet->dccphtx_tstamp = now; packet->dccphtx_seqno = dp->dccps_gss; -#if 0 - ccid3_pr_debug("%s, sk=%p, seqno=%llu inserted!\n", - dccp_role(sk), sk, packet->dccphtx_seqno); -#endif /* - * Check if win_count have changed */ - /* COMPLIANCE_BEGIN - * Algorithm in "8.1. Window Counter Valuer" in draft-ietf-dccp-ccid3-11.txt + * Check if win_count have changed + * Algorithm in "8.1. Window Counter Valuer" in + * draft-ietf-dccp-ccid3-11.txt */ - quarter_rtt = now_delta(hctx->ccid3hctx_t_last_win_count) / (hctx->ccid3hctx_rtt / 4); + quarter_rtt = now_delta(hctx->ccid3hctx_t_last_win_count) / + (hctx->ccid3hctx_rtt / 4); if (quarter_rtt > 0) { hctx->ccid3hctx_t_last_win_count = now; hctx->ccid3hctx_last_win_count = (hctx->ccid3hctx_last_win_count + min_t(unsigned long, quarter_rtt, 5)) % 16; - ccid3_pr_debug("%s, sk=%p, window changed from %u to %u!\n", + ccid3_pr_debug("%s, sk=%p, window changed from " + "%u to %u!\n", dccp_role(sk), sk, packet->dccphtx_ccval, hctx->ccid3hctx_last_win_count); } - /* COMPLIANCE_END */ -#if 0 - ccid3_pr_debug("%s, sk=%p, packet sent (%llu,%u)\n", - dccp_role(sk), sk, - packet->dccphtx_seqno, - packet->dccphtx_ccval); -#endif + hctx->ccid3hctx_idle = 0; packet->dccphtx_rtt = hctx->ccid3hctx_rtt; packet->dccphtx_sent = 1; @@ -1073,7 +1092,8 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) case TFRC_SSTATE_NO_SENT: /* if first wasn't pure ack */ if (len != 0) - printk(KERN_CRIT "%s: %s, First packet sent is noted as a data packet\n", + printk(KERN_CRIT "%s: %s, First packet sent is noted " + "as a data packet\n", __FUNCTION__, dccp_role(sk)); return; case TFRC_SSTATE_NO_FBACK: @@ -1105,16 +1125,13 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) u32 pinv; u32 x_recv; u32 r_sample; -#if 0 - ccid3_pr_debug("%s, sk=%p(%s), skb=%p(%s)\n", - dccp_role(sk), sk, dccp_state_name(sk->sk_state), - skb, dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); -#endif + if (hctx == NULL) return; if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) { - ccid3_pr_debug("%s, sk=%p, received a packet when terminating!\n", dccp_role(sk), sk); + ccid3_pr_debug("%s, sk=%p, received a packet when " + "terminating!\n", dccp_role(sk), sk); return; } @@ -1141,8 +1158,10 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, DCCP_SKB_CB(skb)->dccpd_ack_seq); if (packet == NULL) { - ccid3_pr_debug("%s, sk=%p, seqno %llu(%s) does't exist in history!\n", - dccp_role(sk), sk, DCCP_SKB_CB(skb)->dccpd_ack_seq, + ccid3_pr_debug("%s, sk=%p, seqno %llu(%s) does't " + "exist in history!\n", + dccp_role(sk), sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq, dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); return; } @@ -1164,7 +1183,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); hctx->ccid3hctx_rtt = r_sample; } else - hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 + r_sample / 10; + hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 + + r_sample / 10; /* * XXX: this is to avoid a division by zero in ccid3_hc_tx_packet_sent @@ -1173,17 +1193,16 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (hctx->ccid3hctx_rtt < 4) hctx->ccid3hctx_rtt = 4; - ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, r_sample=%us\n", - dccp_role(sk), sk, - hctx->ccid3hctx_rtt, - r_sample); + ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, " + "r_sample=%us\n", dccp_role(sk), sk, + hctx->ccid3hctx_rtt, r_sample); /* Update timeout interval */ hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, USEC_PER_SEC); /* Update receive rate */ - hctx->ccid3hctx_x_recv = x_recv; /* x_recv in bytes per second */ + hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */ /* Update loss event rate */ if (pinv == ~0 || pinv == 0) @@ -1193,7 +1212,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { hctx->ccid3hctx_p = TFRC_SMALLEST_P; - ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", dccp_role(sk), sk); + ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", + dccp_role(sk), sk); } } @@ -1220,22 +1240,27 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) &hctx->ccid3hctx_hist, packet); if (hctx->ccid3hctx_x < 10) { - ccid3_pr_debug("ccid3_hc_tx_packet_recv hctx->ccid3hctx_x < 10\n"); + ccid3_pr_debug("ccid3_hc_tx_packet_recv hctx_x < 10\n"); hctx->ccid3hctx_x = 10; } /* to prevent divide by zero below */ - /* Schedule no feedback timer to expire in max(4 * R, 2 * s / X) */ + /* + * Schedule no feedback timer to expire in + * max(4 * R, 2 * s / X) + */ next_tmout = max(hctx->ccid3hctx_t_rto, (2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x / 10))); /* maths with 100000 and 10 is to prevent overflow with 32 bit */ - ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, usecs_to_jiffies(next_tmout), next_tmout); + ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to " + "expire in %lu jiffies (%luus)\n", + dccp_role(sk), sk, + usecs_to_jiffies(next_tmout), next_tmout); sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, - jiffies + max_t(u32,1,usecs_to_jiffies(next_tmout))); + jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); /* set idle flag */ hctx->ccid3hctx_idle = 1; @@ -1253,14 +1278,16 @@ static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb) const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; - if (hctx == NULL || !(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) + if (hctx == NULL || !(sk->sk_state == DCCP_OPEN || + sk->sk_state == DCCP_PARTOPEN)) return; DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; } static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, - unsigned char len, u16 idx, unsigned char *value) + unsigned char len, u16 idx, + unsigned char *value) { int rc = 0; struct dccp_sock *dp = dccp_sk(sk); @@ -1283,7 +1310,8 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, switch (option) { case TFRC_OPT_LOSS_EVENT_RATE: if (len != 4) { - ccid3_pr_debug("%s, sk=%p, invalid len for TFRC_OPT_LOSS_EVENT_RATE\n", + ccid3_pr_debug("%s, sk=%p, invalid len for " + "TFRC_OPT_LOSS_EVENT_RATE\n", dccp_role(sk), sk); rc = -EINVAL; } else { @@ -1303,7 +1331,8 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, break; case TFRC_OPT_RECEIVE_RATE: if (len != 4) { - ccid3_pr_debug("%s, sk=%p, invalid len for TFRC_OPT_RECEIVE_RATE\n", + ccid3_pr_debug("%s, sk=%p, invalid len for " + "TFRC_OPT_RECEIVE_RATE\n", dccp_role(sk), sk); rc = -EINVAL; } else { @@ -1325,7 +1354,8 @@ static int ccid3_hc_tx_init(struct sock *sk) ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); - hctx = dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), gfp_any()); + hctx = dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), + gfp_any()); if (hctx == NULL) return -ENOMEM; @@ -1337,8 +1367,10 @@ static int ccid3_hc_tx_init(struct sock *sk) else hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE; - hctx->ccid3hctx_x = hctx->ccid3hctx_s; /* set transmission rate to 1 packet per second */ - hctx->ccid3hctx_rtt = 4; /* See ccid3_hc_tx_packet_sent win_count calculatation */ + /* Set transmission rate to 1 packet per second */ + hctx->ccid3hctx_x = hctx->ccid3hctx_s; + /* See ccid3_hc_tx_packet_sent win_count calculatation */ + hctx->ccid3hctx_rtt = 4; hctx->ccid3hctx_t_rto = USEC_PER_SEC; hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; INIT_LIST_HEAD(&hctx->ccid3hctx_hist); @@ -1389,14 +1421,16 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) } #endif -static inline void ccid3_hc_rx_set_state(struct sock *sk, enum ccid3_hc_rx_states state) +static inline void ccid3_hc_rx_set_state(struct sock *sk, + enum ccid3_hc_rx_states state) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_rx_state_name(oldstate), ccid3_rx_state_name(state)); + dccp_role(sk), sk, ccid3_rx_state_name(oldstate), + ccid3_rx_state_name(state)); WARN_ON(state == oldstate); hcrx->ccid3hcrx_state = state; } @@ -1434,9 +1468,12 @@ static int ccid3_hc_rx_add_hist(struct sock *sk, num_later++; if (num_later == TFRC_RECV_NUM_LATE_LOSS) { - dccp_rx_hist_entry_delete(ccid3_rx_hist, packet); - ccid3_pr_debug("%s, sk=%p, packet(%llu) already lost!\n", - dccp_role(sk), sk, seqno); + dccp_rx_hist_entry_delete(ccid3_rx_hist, + packet); + ccid3_pr_debug("%s, sk=%p, packet" + "(%llu) already lost!\n", + dccp_role(sk), sk, + seqno); return 1; } } @@ -1444,12 +1481,18 @@ static int ccid3_hc_rx_add_hist(struct sock *sk, if (num_later < TFRC_RECV_NUM_LATE_LOSS) dccp_rx_hist_add_entry(&hcrx->ccid3hcrx_hist, packet); - /* FIXME: else what? should we destroy the packet like above? */ + /* + * FIXME: else what? should we destroy the packet + * like above? + */ } } trim_history: - /* Trim history (remove all packets after the NUM_LATE_LOSS + 1 data packets) */ + /* + * Trim history (remove all packets after the NUM_LATE_LOSS + 1 + * data packets) + */ num_later = TFRC_RECV_NUM_LATE_LOSS + 1; if (!list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { @@ -1489,15 +1532,18 @@ trim_history: if (tmp < 0) tmp += TFRC_WIN_COUNT_LIMIT; if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) { - /* we have found a packet older than one rtt - * remove the rest */ + /* + * We have found a packet older + * than one rtt remove the rest + */ step = 3; } else /* OK, find next data packet */ num_later = 1; break; case 3: list_del_init(&entry->dccphrx_node); - dccp_rx_hist_entry_delete(ccid3_rx_hist, entry); + dccp_rx_hist_entry_delete(ccid3_rx_hist, + entry); break; } } else if (dccp_rx_hist_entry_data_packet(entry)) @@ -1564,7 +1610,8 @@ static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) u32 x_recv, pinv; struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; - if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) + if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN || + sk->sk_state == DCCP_PARTOPEN)) return; DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter; @@ -1658,13 +1705,15 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) } if (step == 0) { - printk(KERN_CRIT "%s: %s, sk=%p, packet history contains no data packets!\n", + printk(KERN_CRIT "%s: %s, sk=%p, packet history contains no " + "data packets!\n", __FUNCTION__, dccp_role(sk), sk); return ~0; } if (interval == 0) { - ccid3_pr_debug("%s, sk=%p, Could not find a win_count interval > 0. Defaulting to 1\n", + ccid3_pr_debug("%s, sk=%p, Could not find a win_count " + "interval > 0. Defaulting to 1\n", dccp_role(sk), sk); interval = 1; } @@ -1688,8 +1737,8 @@ found: fval = (hcrx->ccid3hcrx_s * 100000) / tmp2; /* do not alter order above or you will get overflow on 32 bit */ p = calcx_reverse_lookup(fval); - ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied loss rate=%u\n",\ - dccp_role(sk), sk, x_recv, p); + ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied " + "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); if (p == 0) return ~0; @@ -1704,25 +1753,31 @@ static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss) struct ccid3_loss_interval_hist_entry *li_entry; if (seq_loss != DCCP_MAX_SEQNO + 1) { - ccid3_pr_debug("%s, sk=%p, seq_loss=%llu, win_loss=%u, packet loss detected\n", + ccid3_pr_debug("%s, sk=%p, seq_loss=%llu, win_loss=%u, " + "packet loss detected\n", dccp_role(sk), sk, seq_loss, win_loss); if (list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { struct ccid3_loss_interval_hist_entry *li_tail = NULL; int i; - ccid3_pr_debug("%s, sk=%p, first loss event detected, creating history\n", dccp_role(sk), sk); + ccid3_pr_debug("%s, sk=%p, first loss event detected, " + "creating history\n", + dccp_role(sk), sk); for (i = 0; i <= TFRC_RECV_IVAL_F_LENGTH; ++i) { li_entry = ccid3_loss_interval_hist_entry_new(SLAB_ATOMIC); if (li_entry == NULL) { ccid3_loss_interval_history_delete(&hcrx->ccid3hcrx_loss_interval_hist); - ccid3_pr_debug("%s, sk=%p, not enough mem for creating history\n", + ccid3_pr_debug("%s, sk=%p, not enough " + "mem for creating " + "history\n", dccp_role(sk), sk); return; } if (li_tail == NULL) li_tail = li_entry; - list_add(&li_entry->ccid3lih_node, &hcrx->ccid3hcrx_loss_interval_hist); + list_add(&li_entry->ccid3lih_node, + &hcrx->ccid3hcrx_loss_interval_hist); } li_entry->ccid3lih_seqno = seq_loss; @@ -1772,11 +1827,13 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) if (list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { /* no loss event have occured yet */ ccid3_pr_debug("%s, sk=%p, TODO: find a lost data " - "packet by comparing to initial seqno\n", + "packet by comparing to initial " + "seqno\n", dccp_role(sk), sk); goto out_update_li; } else { - pr_info("%s: %s, sk=%p, ERROR! Less than 4 data packets in history", + pr_info("%s: %s, sk=%p, ERROR! Less than 4 data " + "packets in history", __FUNCTION__, dccp_role(sk), sk); return; } @@ -1831,7 +1888,9 @@ static u32 ccid3_hc_rx_calc_i_mean(struct sock *sk) u32 i_tot1 = 0; u32 w_tot = 0; - list_for_each_entry_safe(li_entry, li_next, &hcrx->ccid3hcrx_loss_interval_hist, ccid3lih_node) { + list_for_each_entry_safe(li_entry, li_next, + &hcrx->ccid3hcrx_loss_interval_hist, + ccid3lih_node) { if (i < TFRC_RECV_IVAL_F_LENGTH) { i_tot0 += li_entry->ccid3lih_interval * ccid3_hc_rx_w[i]; w_tot += ccid3_hc_rx_w[i]; @@ -1845,7 +1904,8 @@ static u32 ccid3_hc_rx_calc_i_mean(struct sock *sk) } if (i != TFRC_RECV_IVAL_F_LENGTH) { - pr_info("%s: %s, sk=%p, ERROR! Missing entry in interval history!\n", + pr_info("%s: %s, sk=%p, ERROR! Missing entry in " + "interval history!\n", __FUNCTION__, dccp_role(sk), sk); return 0; } @@ -1870,11 +1930,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) u8 win_count; u32 p_prev; int ins; -#if 0 - ccid3_pr_debug("%s, sk=%p(%s), skb=%p(%s)\n", - dccp_role(sk), sk, dccp_state_name(sk->sk_state), - skb, dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); -#endif + if (hcrx == NULL) return; @@ -1913,7 +1969,8 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv->dccpor_ndp, skb, SLAB_ATOMIC); if (packet == NULL) { - ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet to history (consider it lost)!", + ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet " + "to history (consider it lost)!", dccp_role(sk), sk); return; } @@ -1927,13 +1984,16 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) switch (hcrx->ccid3hcrx_state) { case TFRC_RSTATE_NO_DATA: - ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial feedback\n", - dccp_role(sk), sk, dccp_state_name(sk->sk_state), skb); + ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial " + "feedback\n", + dccp_role(sk), sk, + dccp_state_name(sk->sk_state), skb); ccid3_hc_rx_send_feedback(sk); ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); return; case TFRC_RSTATE_DATA: - hcrx->ccid3hcrx_bytes_recv += skb->len - dccp_hdr(skb)->dccph_doff * 4; + hcrx->ccid3hcrx_bytes_recv += skb->len - + dccp_hdr(skb)->dccph_doff * 4; if (ins == 0) { if (now_delta(hcrx->ccid3hcrx_tstamp_last_ack) >= hcrx->ccid3hcrx_rtt) { @@ -1975,7 +2035,8 @@ static int ccid3_hc_rx_init(struct sock *sk) ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); - hcrx = dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), gfp_any()); + hcrx = dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), + gfp_any()); if (hcrx == NULL) return -ENOMEM; @@ -2135,7 +2196,8 @@ static __exit void ccid3_module_exit(void) } module_exit(ccid3_module_exit); -MODULE_AUTHOR("Ian McDonald & Arnaldo Carvalho de Melo "); +MODULE_AUTHOR("Ian McDonald , " + "Arnaldo Carvalho de Melo "); MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID"); MODULE_LICENSE("GPL"); MODULE_ALIAS("net-dccp-ccid-3"); -- cgit v0.10.2 From b6ee3d4ada4e85d9b9b9164c1327ef0850c79d5e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 27 Aug 2005 18:18:18 -0300 Subject: [CCID3]: Reorganise timeval handling Introducing functions to add to or subtract from a timeval variable and renaming now_delta to timeval_new_delta that calls do_gettimeofday and then timeval_delta, that should be used when there are several deltas made relative to the current time or setting variables to it, so as to avoid calling do_gettimeofday excessively. I'm leaving these "timeval_" prefixed funcions internal to DCCP for a while till we're sure there are no subtle bugs in it. It also is more correct as it checks if the number of usecs added to or subtracted from a tv_usec field is more than 2 seconds. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 225c530..60e3a5f 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -156,26 +156,6 @@ static inline void ccid3_hc_tx_set_state(struct sock *sk, hctx->ccid3hctx_state = state; } -static void timeval_sub(struct timeval large, struct timeval small, - struct timeval *result) -{ - result->tv_sec = large.tv_sec-small.tv_sec; - if (large.tv_usec < small.tv_usec) { - (result->tv_sec)--; - result->tv_usec = USEC_PER_SEC + - large.tv_usec - small.tv_usec; - } else - result->tv_usec = large.tv_usec-small.tv_usec; -} - -static inline void timeval_fix(struct timeval *tv) -{ - if (tv->tv_usec >= USEC_PER_SEC) { - tv->tv_sec++; - tv->tv_usec -= USEC_PER_SEC; - } -} - #define CALCX_ARRSIZE 500 #define CALCX_SPLIT 50000 @@ -816,18 +796,22 @@ static void ccid3_hc_tx_update_x(struct sock *sk) 2 * hctx->ccid3hctx_x_recv), (hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME)); - } else if (now_delta(hctx->ccid3hctx_t_ld) >= hctx->ccid3hctx_rtt) { - u32 rtt = hctx->ccid3hctx_rtt; - if (rtt < 10) { - rtt = 10; - } /* avoid divide by zero below */ - - hctx->ccid3hctx_x = max_t(u32, min_t(u32, 2 * hctx->ccid3hctx_x_recv, - 2 * hctx->ccid3hctx_x), - ((hctx->ccid3hctx_s * 100000) / - (rtt / 10))); - /* Using 100000 and 10 to avoid 32 bit overflow for jumbo frames */ - do_gettimeofday(&hctx->ccid3hctx_t_ld); + } else { + struct timeval now; + + do_gettimeofday(&now); + if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) >= + hctx->ccid3hctx_rtt) { + /* Avoid divide by zero below */ + const u32 rtt = max_t(u32, hctx->ccid3hctx_rtt, 10); + + hctx->ccid3hctx_x = max_t(u32, min_t(u32, 2 * hctx->ccid3hctx_x_recv, + 2 * hctx->ccid3hctx_x), + ((hctx->ccid3hctx_s * 100000) / + (rtt / 10))); + /* Using 100000 and 10 to avoid 32 bit overflow for jumbo frames */ + hctx->ccid3hctx_t_ld = now; + } } if (hctx->ccid3hctx_x == 0) { @@ -999,14 +983,15 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, /* Set nominal send time for initial packet */ hctx->ccid3hctx_t_nom = now; - (hctx->ccid3hctx_t_nom).tv_usec += hctx->ccid3hctx_t_ipi; - timeval_fix(&(hctx->ccid3hctx_t_nom)); + timeval_add_usecs(&hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); ccid3_calc_new_delta(hctx); rc = 0; break; case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: - delay = now_delta(hctx->ccid3hctx_t_nom) - hctx->ccid3hctx_delta; + delay = (timeval_delta(&now, &hctx->ccid3hctx_t_nom) - + hctx->ccid3hctx_delta); ccid3_pr_debug("send_packet delay=%ld\n", delay); delay /= -1000; /* divide by -1000 is to convert to ms and get sign right */ @@ -1068,7 +1053,7 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) * Algorithm in "8.1. Window Counter Valuer" in * draft-ietf-dccp-ccid3-11.txt */ - quarter_rtt = now_delta(hctx->ccid3hctx_t_last_win_count) / + quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count) / (hctx->ccid3hctx_rtt / 4); if (quarter_rtt > 0) { hctx->ccid3hctx_t_last_win_count = now; @@ -1102,8 +1087,8 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) hctx->ccid3hctx_t_nom = now; ccid3_calc_new_t_ipi(hctx); ccid3_calc_new_delta(hctx); - (hctx->ccid3hctx_t_nom).tv_usec += hctx->ccid3hctx_t_ipi; - timeval_fix(&(hctx->ccid3hctx_t_nom)); + timeval_add_usecs(&hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); } break; default: @@ -1167,7 +1152,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* Update RTT */ - r_sample = now_delta(packet->dccphtx_tstamp); + r_sample = timeval_now_delta(&packet->dccphtx_tstamp); /* FIXME: */ // r_sample -= usecs_to_jiffies(t_elapsed * 10); @@ -1224,15 +1209,11 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ccid3_hc_tx_update_x(sk); /* Update next send time */ - if (hctx->ccid3hctx_t_ipi > (hctx->ccid3hctx_t_nom).tv_usec) { - hctx->ccid3hctx_t_nom.tv_usec += USEC_PER_SEC; - (hctx->ccid3hctx_t_nom).tv_sec--; - } - /* FIXME - if no feedback then t_ipi can go > 1 second */ - (hctx->ccid3hctx_t_nom).tv_usec -= hctx->ccid3hctx_t_ipi; + timeval_sub_usecs(&hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); ccid3_calc_new_t_ipi(hctx); - (hctx->ccid3hctx_t_nom).tv_usec += hctx->ccid3hctx_t_ipi; - timeval_fix(&(hctx->ccid3hctx_t_nom)); + timeval_add_usecs(&hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); ccid3_calc_new_delta(hctx); /* remove all packets older than the one acked from history */ @@ -1559,20 +1540,24 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; struct dccp_rx_hist_entry *packet; + struct timeval now; ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + do_gettimeofday(&now); + switch (hcrx->ccid3hcrx_state) { case TFRC_RSTATE_NO_DATA: hcrx->ccid3hcrx_x_recv = 0; break; case TFRC_RSTATE_DATA: { - u32 delta = now_delta(hcrx->ccid3hcrx_tstamp_last_feedback); + const u32 delta = timeval_delta(&now, + &hcrx->ccid3hcrx_tstamp_last_feedback); - if (delta == 0) - delta = 1; /* to prevent divide by zero */ hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv * - USEC_PER_SEC) / delta; + USEC_PER_SEC); + if (likely(delta > 1)) + hcrx->ccid3hcrx_x_recv /= delta; } break; default: @@ -1590,13 +1575,14 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) return; } - do_gettimeofday(&(hcrx->ccid3hcrx_tstamp_last_feedback)); + hcrx->ccid3hcrx_tstamp_last_feedback = now; hcrx->ccid3hcrx_last_counter = packet->dccphrx_ccval; hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno; hcrx->ccid3hcrx_bytes_recv = 0; /* Convert to multiples of 10us */ - hcrx->ccid3hcrx_elapsed_time = now_delta(packet->dccphrx_tstamp) / 10; + hcrx->ccid3hcrx_elapsed_time = + timeval_delta(&now, &packet->dccphrx_tstamp) / 10; if (hcrx->ccid3hcrx_p == 0) hcrx->ccid3hcrx_pinv = ~0; else @@ -1676,7 +1662,7 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; struct dccp_rx_hist_entry *entry, *next, *tail = NULL; u32 rtt, delta, x_recv, fval, p, tmp2; - struct timeval tstamp = { 0 }, tmp_tv; + struct timeval tstamp = { 0, }; int interval = 0; int win_count = 0; int step = 0; @@ -1718,18 +1704,16 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) interval = 1; } found: - timeval_sub(tstamp,tail->dccphrx_tstamp,&tmp_tv); - rtt = (tmp_tv.tv_sec * USEC_PER_SEC + tmp_tv.tv_usec) * 4 / interval; + rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval; ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n", dccp_role(sk), sk, rtt); if (rtt == 0) rtt = 1; - delta = now_delta(hcrx->ccid3hcrx_tstamp_last_feedback); - if (delta == 0) - delta = 1; - - x_recv = (hcrx->ccid3hcrx_bytes_recv * USEC_PER_SEC) / delta; + delta = timeval_now_delta(&hcrx->ccid3hcrx_tstamp_last_feedback); + x_recv = hcrx->ccid3hcrx_bytes_recv * USEC_PER_SEC; + if (likely(delta > 1)) + x_recv /= delta; tmp1 = (u64)x_recv * (u64)rtt; do_div(tmp1,10000000); @@ -1926,7 +1910,6 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) const struct dccp_options_received *opt_recv; struct dccp_rx_hist_entry *packet; struct timeval now; - u32 now_usecs; u8 win_count; u32 p_prev; int ins; @@ -1948,8 +1931,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) break; p_prev = hcrx->ccid3hcrx_rtt; do_gettimeofday(&now); - now_usecs = now.tv_sec * USEC_PER_SEC + now.tv_usec; - hcrx->ccid3hcrx_rtt = now_usecs - + hcrx->ccid3hcrx_rtt = timeval_usecs(&now) - (opt_recv->dccpor_timestamp_echo - opt_recv->dccpor_elapsed_time) * 10; if (p_prev != hcrx->ccid3hcrx_rtt) @@ -1994,15 +1976,16 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) case TFRC_RSTATE_DATA: hcrx->ccid3hcrx_bytes_recv += skb->len - dccp_hdr(skb)->dccph_doff * 4; - if (ins == 0) { - if (now_delta(hcrx->ccid3hcrx_tstamp_last_ack) >= - hcrx->ccid3hcrx_rtt) { - do_gettimeofday(&hcrx->ccid3hcrx_tstamp_last_ack); - ccid3_hc_rx_send_feedback(sk); - } - return; + if (ins != 0) + break; + + do_gettimeofday(&now); + if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >= + hcrx->ccid3hcrx_rtt) { + hcrx->ccid3hcrx_tstamp_last_ack = now; + ccid3_hc_rx_send_feedback(sk); } - break; + return; default: printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 6ba2150..5cd9e79 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -429,17 +429,53 @@ extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state); extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk, u64 ackno); +static inline suseconds_t timeval_usecs(const struct timeval *tv) +{ + return tv->tv_sec * USEC_PER_SEC + tv->tv_usec; +} + +static inline suseconds_t timeval_delta(const struct timeval *large, + const struct timeval *small) +{ + time_t secs = large->tv_sec - small->tv_sec; + suseconds_t usecs = large->tv_usec - small->tv_usec; + + if (usecs < 0) { + secs--; + usecs += USEC_PER_SEC; + } + return secs * USEC_PER_SEC + usecs; +} + +static inline void timeval_add_usecs(struct timeval *tv, + const suseconds_t usecs) +{ + tv->tv_usec += usecs; + while (tv->tv_usec >= USEC_PER_SEC) { + tv->tv_sec++; + tv->tv_usec -= USEC_PER_SEC; + } +} + +static inline void timeval_sub_usecs(struct timeval *tv, + const suseconds_t usecs) +{ + tv->tv_usec -= usecs; + while (tv->tv_usec < 0) { + tv->tv_sec--; + tv->tv_usec += USEC_PER_SEC; + } +} + /* * Returns the difference in usecs between timeval * passed in and current time */ -static inline u32 now_delta(struct timeval tv) +static inline suseconds_t timeval_now_delta(const struct timeval *tv) { struct timeval now; - do_gettimeofday(&now); - return (now.tv_sec - tv.tv_sec) * USEC_PER_SEC + - (now.tv_usec - tv.tv_usec); + return timeval_delta(&now, tv); } #ifdef CONFIG_IP_DCCP_DEBUG diff --git a/net/dccp/options.c b/net/dccp/options.c index eabcc8f..382c589 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -359,7 +359,7 @@ static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) #endif struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; int len = ap->dccpap_buf_vector_len + 2; - const u32 elapsed_time = now_delta(ap->dccpap_time) / 10; + const u32 elapsed_time = timeval_now_delta(&ap->dccpap_time) / 10; unsigned char *to, *from; if (elapsed_time != 0) @@ -451,7 +451,8 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk, "CLIENT TX opt: " : "server TX opt: "; #endif u32 tstamp_echo; - const u32 elapsed_time = now_delta(dp->dccps_timestamp_time) / 10; + const u32 elapsed_time = + timeval_now_delta(&dp->dccps_timestamp_time) / 10; const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); const int len = 6 + elapsed_time_len; unsigned char *to; -- cgit v0.10.2 From 6b5e633ab1525b4def3f36b53903b00586e9966d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 27 Aug 2005 20:11:28 -0300 Subject: [CCID3]: Introduce usecs_div To avoid open coding this all over the place. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 60e3a5f..ff41977 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -40,6 +40,15 @@ #include "../packet_history.h" #include "ccid3.h" +/* + * Reason for maths with 10 here is to avoid 32 bit overflow when a is big. + */ +static inline u32 usecs_div(const u32 a, const u32 b) +{ + const u32 tmp = a * (USEC_PER_SEC / 10); + return b > 20 ? tmp / (b / 10) : tmp; +} + #ifdef CCID3_DEBUG extern int ccid3_debug; @@ -748,20 +757,13 @@ static u32 ccid3_calc_x(u16 s, u32 R, u32 p) /* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */ static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx) { - if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) - return; - /* if no feedback spec says t_ipi is 1 second (set elsewhere and then - * doubles after every no feedback timer (separate function) */ - - if (hctx->ccid3hctx_x < 10) { - ccid3_pr_debug("ccid3_calc_new_t_ipi - ccid3hctx_x < 10\n"); - hctx->ccid3hctx_x = 10; - } - hctx->ccid3hctx_t_ipi = (hctx->ccid3hctx_s * 100000) - / (hctx->ccid3hctx_x / 10); - /* reason for above maths with 10 in there is to avoid 32 bit - * overflow for jumbo packets */ - + /* + * If no feedback spec says t_ipi is 1 second (set elsewhere and then + * doubles after every no feedback timer (separate function) + */ + if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + hctx->ccid3hctx_t_ipi = usecs_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_x); } /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ @@ -769,7 +771,6 @@ static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx) { hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); - } /* @@ -802,22 +803,13 @@ static void ccid3_hc_tx_update_x(struct sock *sk) do_gettimeofday(&now); if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) >= hctx->ccid3hctx_rtt) { - /* Avoid divide by zero below */ - const u32 rtt = max_t(u32, hctx->ccid3hctx_rtt, 10); - - hctx->ccid3hctx_x = max_t(u32, min_t(u32, 2 * hctx->ccid3hctx_x_recv, - 2 * hctx->ccid3hctx_x), - ((hctx->ccid3hctx_s * 100000) / - (rtt / 10))); - /* Using 100000 and 10 to avoid 32 bit overflow for jumbo frames */ + hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_recv, + hctx->ccid3hctx_x) * 2, + usecs_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt)); hctx->ccid3hctx_t_ld = now; } } - - if (hctx->ccid3hctx_x == 0) { - ccid3_pr_debug("ccid3hctx_x = 0!\n"); - hctx->ccid3hctx_x = 1; - } } static void ccid3_hc_tx_no_feedback_timer(unsigned long data) @@ -826,7 +818,6 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) struct dccp_sock *dp = dccp_sk(sk); unsigned long next_tmout = 0; struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; - u32 rtt; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -840,19 +831,14 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state)); - if (hctx->ccid3hctx_x < 10) { - ccid3_pr_debug("TFRC_SSTATE_NO_FBACK ccid3hctx_x < 10\n"); - hctx->ccid3hctx_x = 10; - } - switch (hctx->ccid3hctx_state) { case TFRC_SSTATE_TERM: goto out; case TFRC_SSTATE_NO_FBACK: /* Halve send rate */ hctx->ccid3hctx_x /= 2; - if (hctx->ccid3hctx_x < - (hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME)) + if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s / + TFRC_MAX_BACK_OFF_TIME)) hctx->ccid3hctx_x = (hctx->ccid3hctx_s / TFRC_MAX_BACK_OFF_TIME); @@ -861,9 +847,9 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state), hctx->ccid3hctx_x); - next_tmout = max_t(u32, 2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x / 10), + next_tmout = max_t(u32, 2 * usecs_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_x), TFRC_INITIAL_TIMEOUT); - /* do above maths with 100000 and 10 to prevent overflow on 32 bit */ /* * FIXME - not sure above calculation is correct. See section * 5 of CCID3 11 should adjust tx_t_ipi and double that to @@ -875,12 +861,9 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * Check if IDLE since last timeout and recv rate is less than * 4 packets per RTT */ - rtt = hctx->ccid3hctx_rtt; - if (rtt < 10) - rtt = 10; - /* stop divide by zero below */ if (!hctx->ccid3hctx_idle || - (hctx->ccid3hctx_x_recv >= 4 * (hctx->ccid3hctx_s * 100000) / (rtt / 10))) { + (hctx->ccid3hctx_x_recv >= + 4 * usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt))) { ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n", dccp_role(sk), sk, ccid3_tx_state_name(hctx->ccid3hctx_state)); @@ -905,13 +888,13 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) /* Update sending rate */ ccid3_hc_tx_update_x(sk); } - if (hctx->ccid3hctx_x == 0) { - ccid3_pr_debug("TFRC_SSTATE_FBACK ccid3hctx_x = 0!\n"); - hctx->ccid3hctx_x = 10; - } - /* Schedule no feedback timer to expire in max(4 * R, 2 * s / X) */ + /* + * Schedule no feedback timer to expire in + * max(4 * R, 2 * s / X) + */ next_tmout = max_t(u32, hctx->ccid3hctx_t_rto, - 2 * (hctx->ccid3hctx_s * 100000) / (hctx->ccid3hctx_x / 10)); + 2 * usecs_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_x)); break; default: printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", @@ -1053,8 +1036,10 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) * Algorithm in "8.1. Window Counter Valuer" in * draft-ietf-dccp-ccid3-11.txt */ - quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count) / - (hctx->ccid3hctx_rtt / 4); + quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count); + if (likely(hctx->ccid3hctx_rtt > 8)) + quarter_rtt /= hctx->ccid3hctx_rtt / 4; + if (quarter_rtt > 0) { hctx->ccid3hctx_t_last_win_count = now; hctx->ccid3hctx_last_win_count = (hctx->ccid3hctx_last_win_count + @@ -1171,13 +1156,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 + r_sample / 10; - /* - * XXX: this is to avoid a division by zero in ccid3_hc_tx_packet_sent - * implemention of the new window count. - */ - if (hctx->ccid3hctx_rtt < 4) - hctx->ccid3hctx_rtt = 4; - ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, " "r_sample=%us\n", dccp_role(sk), sk, hctx->ccid3hctx_rtt, r_sample); @@ -1220,21 +1198,14 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) dccp_tx_hist_purge_older(ccid3_tx_hist, &hctx->ccid3hctx_hist, packet); - if (hctx->ccid3hctx_x < 10) { - ccid3_pr_debug("ccid3_hc_tx_packet_recv hctx_x < 10\n"); - hctx->ccid3hctx_x = 10; - } - /* to prevent divide by zero below */ - /* * Schedule no feedback timer to expire in * max(4 * R, 2 * s / X) */ next_tmout = max(hctx->ccid3hctx_t_rto, - (2 * (hctx->ccid3hctx_s * 100000) / - (hctx->ccid3hctx_x / 10))); - /* maths with 100000 and 10 is to prevent overflow with 32 bit */ - + 2 * usecs_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_x)); + ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to " "expire in %lu jiffies (%luus)\n", dccp_role(sk), sk, @@ -1350,8 +1321,6 @@ static int ccid3_hc_tx_init(struct sock *sk) /* Set transmission rate to 1 packet per second */ hctx->ccid3hctx_x = hctx->ccid3hctx_s; - /* See ccid3_hc_tx_packet_sent win_count calculatation */ - hctx->ccid3hctx_rtt = 4; hctx->ccid3hctx_t_rto = USEC_PER_SEC; hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; INIT_LIST_HEAD(&hctx->ccid3hctx_hist); -- cgit v0.10.2 From cfc3c525a3b434cabf92bf7054f2c6c93497fbea Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 27 Aug 2005 20:20:37 -0300 Subject: [CCID3]: Move the CCID3 defines to ccid3.h Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index ff41977..cfd1123 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -49,9 +49,9 @@ static inline u32 usecs_div(const u32 a, const u32 b) return b > 20 ? tmp / (b / 10) : tmp; } -#ifdef CCID3_DEBUG -extern int ccid3_debug; +static int ccid3_debug; +#ifdef CCID3_DEBUG #define ccid3_pr_debug(format, a...) \ do { if (ccid3_debug) \ printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \ @@ -60,37 +60,6 @@ extern int ccid3_debug; #define ccid3_pr_debug(format, a...) #endif -#define TFRC_MIN_PACKET_SIZE 16 -#define TFRC_STD_PACKET_SIZE 256 -#define TFRC_MAX_PACKET_SIZE 65535 - -#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) -/* two seconds as per CCID3 spec 11 */ - -#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) -/* above is in usecs - half the scheduling granularity as per RFC3448 4.6 */ - -#define TFRC_WIN_COUNT_PER_RTT 4 -#define TFRC_WIN_COUNT_LIMIT 16 - -#define TFRC_MAX_BACK_OFF_TIME 64 -/* above is in seconds */ - -#define TFRC_SMALLEST_P 40 - -#define TFRC_RECV_IVAL_F_LENGTH 8 /* length(w[]) */ - -/* Number of later packets received before one is considered lost */ -#define TFRC_RECV_NUM_LATE_LOSS 3 - -enum ccid3_options { - TFRC_OPT_LOSS_EVENT_RATE = 192, - TFRC_OPT_LOSS_INTERVALS = 193, - TFRC_OPT_RECEIVE_RATE = 194, -}; - -static int ccid3_debug; - static struct dccp_tx_hist *ccid3_tx_hist; static struct dccp_rx_hist *ccid3_rx_hist; diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 5ef72cd..f896570 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -36,8 +36,39 @@ #ifndef _DCCP_CCID3_H_ #define _DCCP_CCID3_H_ -#include +#include #include +#include +#include + +#define TFRC_MIN_PACKET_SIZE 16 +#define TFRC_STD_PACKET_SIZE 256 +#define TFRC_MAX_PACKET_SIZE 65535 + +/* Two seconds as per CCID3 spec */ +#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) + +/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ +#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) + +#define TFRC_WIN_COUNT_PER_RTT 4 +#define TFRC_WIN_COUNT_LIMIT 16 + +/* In seconds */ +#define TFRC_MAX_BACK_OFF_TIME 64 + +#define TFRC_SMALLEST_P 40 + +#define TFRC_RECV_IVAL_F_LENGTH 8 + +/* Number of later packets received before one is considered lost */ +#define TFRC_RECV_NUM_LATE_LOSS 3 + +enum ccid3_options { + TFRC_OPT_LOSS_EVENT_RATE = 192, + TFRC_OPT_LOSS_INTERVALS = 193, + TFRC_OPT_RECEIVE_RATE = 194, +}; struct ccid3_options_received { u64 ccid3or_seqno:48, @@ -47,7 +78,7 @@ struct ccid3_options_received { u32 ccid3or_receive_rate; }; -/** struct ccid3_hc_tx_sock - CCID3 sender half connection congestion control block +/** struct ccid3_hc_tx_sock - CCID3 sender half connection sock * * @ccid3hctx_state - Sender state * @ccid3hctx_x - Current sending rate @@ -57,7 +88,8 @@ struct ccid3_options_received { * @ccid3hctx_rtt - Estimate of current round trip time in usecs * @@ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 * @ccid3hctx_last_win_count - Last window counter sent - * @ccid3hctx_t_last_win_count - Timestamp of earliest packet with last_win_count value sent + * @ccid3hctx_t_last_win_count - Timestamp of earliest packet + * with last_win_count value sent * @ccid3hctx_no_feedback_timer - Handle to no feedback timer * @ccid3hctx_idle - FIXME * @ccid3hctx_t_ld - Time last doubled during slow start @@ -112,9 +144,9 @@ struct ccid3_hc_rx_sock { }; #define ccid3_hc_tx_field(s,field) (s->dccps_hc_tx_ccid_private == NULL ? 0 : \ - ((struct ccid3_hc_tx_sock *)s->dccps_hc_tx_ccid_private)->ccid3hctx_##field) + ((struct ccid3_hc_tx_sock *)s->dccps_hc_tx_ccid_private)->ccid3hctx_##field) #define ccid3_hc_rx_field(s,field) (s->dccps_hc_rx_ccid_private == NULL ? 0 : \ - ((struct ccid3_hc_rx_sock *)s->dccps_hc_rx_ccid_private)->ccid3hcrx_##field) + ((struct ccid3_hc_rx_sock *)s->dccps_hc_rx_ccid_private)->ccid3hcrx_##field) #endif /* _DCCP_CCID3_H_ */ -- cgit v0.10.2 From ae6706f0678b89de07ad3b456893cc883584f711 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 27 Aug 2005 23:03:09 -0300 Subject: [CCID3]: Move the loss interval code to loss_interval.[ch] And put this into net/dccp/ccids/lib/, where packet_history.[ch] will also be moved and then we'll have a tfrc_lib.ko module that will be used by dccp_ccid3.ko and other CCIDs that are variations of TFRC (RFC 3448). Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile index 1c72013..323b68f 100644 --- a/net/dccp/ccids/Makefile +++ b/net/dccp/ccids/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o -dccp_ccid3-y := ccid3.o +dccp_ccid3-y := ccid3.o lib/loss_interval.o diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index cfd1123..7468928 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -38,6 +38,7 @@ #include "../ccid.h" #include "../dccp.h" #include "../packet_history.h" +#include "lib/loss_interval.h" #include "ccid3.h" /* @@ -62,30 +63,7 @@ static int ccid3_debug; static struct dccp_tx_hist *ccid3_tx_hist; static struct dccp_rx_hist *ccid3_rx_hist; - -static kmem_cache_t *ccid3_loss_interval_hist_slab __read_mostly; - -static inline struct ccid3_loss_interval_hist_entry * - ccid3_loss_interval_hist_entry_new(const unsigned int __nocast prio) -{ - return kmem_cache_alloc(ccid3_loss_interval_hist_slab, prio); -} - -static inline void ccid3_loss_interval_hist_entry_delete(struct ccid3_loss_interval_hist_entry *entry) -{ - if (entry != NULL) - kmem_cache_free(ccid3_loss_interval_hist_slab, entry); -} - -static void ccid3_loss_interval_history_delete(struct list_head *hist) -{ - struct ccid3_loss_interval_hist_entry *entry, *next; - - list_for_each_entry_safe(entry, next, hist, ccid3lih_node) { - list_del_init(&entry->ccid3lih_node); - kmem_cache_free(ccid3_loss_interval_hist_slab, entry); - } -} +static struct dccp_li_hist *ccid3_li_hist; static int ccid3_init(struct sock *sk) { @@ -1414,7 +1392,7 @@ trim_history: */ num_later = TFRC_RECV_NUM_LATE_LOSS + 1; - if (!list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { + if (!list_empty(&hcrx->ccid3hcrx_li_hist)) { list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, dccphrx_node) { if (num_later == 0) { @@ -1555,15 +1533,6 @@ static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) &x_recv, sizeof(x_recv)); } -/* Weights used to calculate loss event rate */ -/* - * These are integers as per section 8 of RFC3448. We can then divide by 4 * - * when we use it. - */ -static const int ccid3_hc_rx_w[TFRC_RECV_IVAL_F_LENGTH] = { - 4, 4, 4, 4, 3, 2, 1, 1, -}; - /* * args: fvalue - function value to match * returns: p closest to that value @@ -1672,41 +1641,17 @@ static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; - struct ccid3_loss_interval_hist_entry *li_entry; - if (seq_loss != DCCP_MAX_SEQNO + 1) { - ccid3_pr_debug("%s, sk=%p, seq_loss=%llu, win_loss=%u, " - "packet loss detected\n", - dccp_role(sk), sk, seq_loss, win_loss); - - if (list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { - struct ccid3_loss_interval_hist_entry *li_tail = NULL; - int i; - - ccid3_pr_debug("%s, sk=%p, first loss event detected, " - "creating history\n", - dccp_role(sk), sk); - for (i = 0; i <= TFRC_RECV_IVAL_F_LENGTH; ++i) { - li_entry = ccid3_loss_interval_hist_entry_new(SLAB_ATOMIC); - if (li_entry == NULL) { - ccid3_loss_interval_history_delete(&hcrx->ccid3hcrx_loss_interval_hist); - ccid3_pr_debug("%s, sk=%p, not enough " - "mem for creating " - "history\n", - dccp_role(sk), sk); - return; - } - if (li_tail == NULL) - li_tail = li_entry; - list_add(&li_entry->ccid3lih_node, - &hcrx->ccid3hcrx_loss_interval_hist); - } + if (seq_loss != DCCP_MAX_SEQNO + 1 && + list_empty(&hcrx->ccid3hcrx_li_hist)) { + struct dccp_li_hist_entry *li_tail; - li_entry->ccid3lih_seqno = seq_loss; - li_entry->ccid3lih_win_count = win_loss; - - li_tail->ccid3lih_interval = ccid3_hc_rx_calc_first_li(sk); - } + li_tail = dccp_li_hist_interval_new(ccid3_li_hist, + &hcrx->ccid3hcrx_li_hist, + seq_loss, win_loss); + if (li_tail == NULL) + return; + li_tail->dccplih_interval = ccid3_hc_rx_calc_first_li(sk); } /* FIXME: find end of interval */ } @@ -1746,12 +1691,11 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) } if (a_loss == NULL) { - if (list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) { + if (list_empty(&hcrx->ccid3hcrx_li_hist)) { /* no loss event have occured yet */ - ccid3_pr_debug("%s, sk=%p, TODO: find a lost data " - "packet by comparing to initial " - "seqno\n", - dccp_role(sk), sk); + LIMIT_NETDEBUG("%s: TODO: find a lost data packet by " + "comparing to initial seqno\n", + dccp_role(sk)); goto out_update_li; } else { pr_info("%s: %s, sk=%p, ERROR! Less than 4 data " @@ -1799,48 +1743,6 @@ out_update_li: ccid3_hc_rx_update_li(sk, seq_loss, win_loss); } -static u32 ccid3_hc_rx_calc_i_mean(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; - struct ccid3_loss_interval_hist_entry *li_entry, *li_next; - int i = 0; - u32 i_tot; - u32 i_tot0 = 0; - u32 i_tot1 = 0; - u32 w_tot = 0; - - list_for_each_entry_safe(li_entry, li_next, - &hcrx->ccid3hcrx_loss_interval_hist, - ccid3lih_node) { - if (i < TFRC_RECV_IVAL_F_LENGTH) { - i_tot0 += li_entry->ccid3lih_interval * ccid3_hc_rx_w[i]; - w_tot += ccid3_hc_rx_w[i]; - } - - if (i != 0) - i_tot1 += li_entry->ccid3lih_interval * ccid3_hc_rx_w[i - 1]; - - if (++i > TFRC_RECV_IVAL_F_LENGTH) - break; - } - - if (i != TFRC_RECV_IVAL_F_LENGTH) { - pr_info("%s: %s, sk=%p, ERROR! Missing entry in " - "interval history!\n", - __FUNCTION__, dccp_role(sk), sk); - return 0; - } - - i_tot = max(i_tot0, i_tot1); - - /* FIXME: Why do we do this? -Ian McDonald */ - if (i_tot * 4 < w_tot) - i_tot = w_tot * 4; - - return i_tot * 4 / w_tot; -} - static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); @@ -1939,9 +1841,9 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) p_prev = hcrx->ccid3hcrx_p; /* Calculate loss event rate */ - if (!list_empty(&hcrx->ccid3hcrx_loss_interval_hist)) + if (!list_empty(&hcrx->ccid3hcrx_li_hist)) /* Scaling up by 1000000 as fixed decimal */ - hcrx->ccid3hcrx_p = 1000000 / ccid3_hc_rx_calc_i_mean(sk); + hcrx->ccid3hcrx_p = 1000000 / dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist); if (hcrx->ccid3hcrx_p > p_prev) { ccid3_hc_rx_send_feedback(sk); @@ -1971,7 +1873,7 @@ static int ccid3_hc_rx_init(struct sock *sk) hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist); - INIT_LIST_HEAD(&hcrx->ccid3hcrx_loss_interval_hist); + INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist); /* * XXX this seems to be paranoid, need to think more about this, for * now start with something different than zero. -acme @@ -1996,7 +1898,7 @@ static void ccid3_hc_rx_exit(struct sock *sk) dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist); /* Empty loss interval history */ - ccid3_loss_interval_history_delete(&hcrx->ccid3hcrx_loss_interval_hist); + dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist); kfree(dp->dccps_hc_rx_ccid_private); dp->dccps_hc_rx_ccid_private = NULL; @@ -2063,11 +1965,8 @@ static __init int ccid3_module_init(void) if (ccid3_tx_hist == NULL) goto out_free_rx; - ccid3_loss_interval_hist_slab = kmem_cache_create("li_hist_ccid3", - sizeof(struct ccid3_loss_interval_hist_entry), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (ccid3_loss_interval_hist_slab == NULL) + ccid3_li_hist = dccp_li_hist_new("ccid3"); + if (ccid3_li_hist == NULL) goto out_free_tx; rc = ccid_register(&ccid3); @@ -2077,8 +1976,8 @@ out: return rc; out_free_loss_interval_history: - kmem_cache_destroy(ccid3_loss_interval_hist_slab); - ccid3_loss_interval_hist_slab = NULL; + dccp_li_hist_delete(ccid3_li_hist); + ccid3_li_hist = NULL; out_free_tx: dccp_tx_hist_delete(ccid3_tx_hist); ccid3_tx_hist = NULL; @@ -2110,9 +2009,9 @@ static __exit void ccid3_module_exit(void) dccp_rx_hist_delete(ccid3_rx_hist); ccid3_rx_hist = NULL; } - if (ccid3_loss_interval_hist_slab != NULL) { - kmem_cache_destroy(ccid3_loss_interval_hist_slab); - ccid3_loss_interval_hist_slab = NULL; + if (ccid3_li_hist != NULL) { + dccp_li_hist_delete(ccid3_li_hist); + ccid3_li_hist = NULL; } } module_exit(ccid3_module_exit); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index f896570..f68d0b4 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -59,8 +59,6 @@ #define TFRC_SMALLEST_P 40 -#define TFRC_RECV_IVAL_F_LENGTH 8 - /* Number of later packets received before one is considered lost */ #define TFRC_RECV_NUM_LATE_LOSS 3 @@ -119,13 +117,6 @@ struct ccid3_hc_tx_sock { struct ccid3_options_received ccid3hctx_options_received; }; -struct ccid3_loss_interval_hist_entry { - struct list_head ccid3lih_node; - u64 ccid3lih_seqno:48, - ccid3lih_win_count:4; - u32 ccid3lih_interval; -}; - struct ccid3_hc_rx_sock { u64 ccid3hcrx_seqno_last_counter:48, ccid3hcrx_state:8, @@ -136,7 +127,7 @@ struct ccid3_hc_rx_sock { struct timeval ccid3hcrx_tstamp_last_feedback; struct timeval ccid3hcrx_tstamp_last_ack; struct list_head ccid3hcrx_hist; - struct list_head ccid3hcrx_loss_interval_hist; + struct list_head ccid3hcrx_li_hist; u16 ccid3hcrx_s; u32 ccid3hcrx_pinv; u32 ccid3hcrx_elapsed_time; diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c new file mode 100644 index 0000000..4c01a54 --- /dev/null +++ b/net/dccp/ccids/lib/loss_interval.c @@ -0,0 +1,144 @@ +/* + * net/dccp/ccids/lib/loss_interval.c + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include + +#include "loss_interval.h" + +struct dccp_li_hist *dccp_li_hist_new(const char *name) +{ + struct dccp_li_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); + static const char dccp_li_hist_mask[] = "li_hist_%s"; + char *slab_name; + + if (hist == NULL) + goto out; + + slab_name = kmalloc(strlen(name) + sizeof(dccp_li_hist_mask) - 1, + GFP_ATOMIC); + if (slab_name == NULL) + goto out_free_hist; + + sprintf(slab_name, dccp_li_hist_mask, name); + hist->dccplih_slab = kmem_cache_create(slab_name, + sizeof(struct dccp_li_hist_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (hist->dccplih_slab == NULL) + goto out_free_slab_name; +out: + return hist; +out_free_slab_name: + kfree(slab_name); +out_free_hist: + kfree(hist); + hist = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(dccp_li_hist_new); + +void dccp_li_hist_delete(struct dccp_li_hist *hist) +{ + const char* name = kmem_cache_name(hist->dccplih_slab); + + kmem_cache_destroy(hist->dccplih_slab); + kfree(name); + kfree(hist); +} + +EXPORT_SYMBOL_GPL(dccp_li_hist_delete); + +void dccp_li_hist_purge(struct dccp_li_hist *hist, struct list_head *list) +{ + struct dccp_li_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, list, dccplih_node) { + list_del_init(&entry->dccplih_node); + kmem_cache_free(hist->dccplih_slab, entry); + } +} + +EXPORT_SYMBOL_GPL(dccp_li_hist_purge); + +/* Weights used to calculate loss event rate */ +/* + * These are integers as per section 8 of RFC3448. We can then divide by 4 * + * when we use it. + */ +static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = { + 4, 4, 4, 4, 3, 2, 1, 1, +}; + +u32 dccp_li_hist_calc_i_mean(struct list_head *list) +{ + struct dccp_li_hist_entry *li_entry, *li_next; + int i = 0; + u32 i_tot; + u32 i_tot0 = 0; + u32 i_tot1 = 0; + u32 w_tot = 0; + + list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) { + if (i < DCCP_LI_HIST_IVAL_F_LENGTH) { + i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i]; + w_tot += dccp_li_hist_w[i]; + } + + if (i != 0) + i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1]; + + if (++i > DCCP_LI_HIST_IVAL_F_LENGTH) + break; + } + + if (i != DCCP_LI_HIST_IVAL_F_LENGTH) + return 0; + + i_tot = max(i_tot0, i_tot1); + + /* FIXME: Why do we do this? -Ian McDonald */ + if (i_tot * 4 < w_tot) + i_tot = w_tot * 4; + + return i_tot * 4 / w_tot; +} + +EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean); + +struct dccp_li_hist_entry *dccp_li_hist_interval_new(struct dccp_li_hist *hist, + struct list_head *list, + const u64 seq_loss, + const u8 win_loss) +{ + struct dccp_li_hist_entry *tail = NULL, *entry; + int i; + + for (i = 0; i <= DCCP_LI_HIST_IVAL_F_LENGTH; ++i) { + entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC); + if (entry == NULL) { + dccp_li_hist_purge(hist, list); + return NULL; + } + if (tail == NULL) + tail = entry; + list_add(&entry->dccplih_node, list); + } + + entry->dccplih_seqno = seq_loss; + entry->dccplih_win_count = win_loss; + return tail; +} + +EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new); diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h new file mode 100644 index 0000000..13ad47b --- /dev/null +++ b/net/dccp/ccids/lib/loss_interval.h @@ -0,0 +1,61 @@ +#ifndef _DCCP_LI_HIST_ +#define _DCCP_LI_HIST_ +/* + * net/dccp/ccids/lib/loss_interval.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include + +#define DCCP_LI_HIST_IVAL_F_LENGTH 8 + +struct dccp_li_hist { + kmem_cache_t *dccplih_slab; +}; + +extern struct dccp_li_hist *dccp_li_hist_new(const char *name); +extern void dccp_li_hist_delete(struct dccp_li_hist *hist); + +struct dccp_li_hist_entry { + struct list_head dccplih_node; + u64 dccplih_seqno:48, + dccplih_win_count:4; + u32 dccplih_interval; +}; + +static inline struct dccp_li_hist_entry * + dccp_li_hist_entry_new(struct dccp_li_hist *hist, + const unsigned int __nocast prio) +{ + return kmem_cache_alloc(hist->dccplih_slab, prio); +} + +static inline void dccp_li_hist_entry_delete(struct dccp_li_hist *hist, + struct dccp_li_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(hist->dccplih_slab, entry); +} + +extern void dccp_li_hist_purge(struct dccp_li_hist *hist, + struct list_head *list); + +extern u32 dccp_li_hist_calc_i_mean(struct list_head *list); + +extern struct dccp_li_hist_entry * + dccp_li_hist_interval_new(struct dccp_li_hist *hist, + struct list_head *list, + const u64 seq_loss, + const u8 win_loss); +#endif /* _DCCP_LI_HIST_ */ -- cgit v0.10.2 From 4524b259541e1eea07020af825d8e7b0e4faaec5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 27 Aug 2005 23:18:26 -0300 Subject: [DCCP]: Just move packet_history.[ch] to net/dccp/ccids/lib/ Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 44a867f..fb97bb0 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ - timer.o packet_history.o + timer.o obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile index 323b68f..29eb1b6 100644 --- a/net/dccp/ccids/Makefile +++ b/net/dccp/ccids/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o -dccp_ccid3-y := ccid3.o lib/loss_interval.o +dccp_ccid3-y := ccid3.o lib/loss_interval.o lib/packet_history.o diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 7468928..12548fb 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -37,7 +37,7 @@ #include #include "../ccid.h" #include "../dccp.h" -#include "../packet_history.h" +#include "lib/packet_history.h" #include "lib/loss_interval.h" #include "ccid3.h" diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c new file mode 100644 index 0000000..2d9ef5a --- /dev/null +++ b/net/dccp/ccids/lib/packet_history.c @@ -0,0 +1,199 @@ +/* + * net/dccp/packet_history.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include + +#include "packet_history.h" + +struct dccp_rx_hist *dccp_rx_hist_new(const char *name) +{ + struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); + static const char dccp_rx_hist_mask[] = "rx_hist_%s"; + char *slab_name; + + if (hist == NULL) + goto out; + + slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1, + GFP_ATOMIC); + if (slab_name == NULL) + goto out_free_hist; + + sprintf(slab_name, dccp_rx_hist_mask, name); + hist->dccprxh_slab = kmem_cache_create(slab_name, + sizeof(struct dccp_rx_hist_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (hist->dccprxh_slab == NULL) + goto out_free_slab_name; +out: + return hist; +out_free_slab_name: + kfree(slab_name); +out_free_hist: + kfree(hist); + hist = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_new); + +void dccp_rx_hist_delete(struct dccp_rx_hist *hist) +{ + const char* name = kmem_cache_name(hist->dccprxh_slab); + + kmem_cache_destroy(hist->dccprxh_slab); + kfree(name); + kfree(hist); +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_delete); + +void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list) +{ + struct dccp_rx_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, list, dccphrx_node) { + list_del_init(&entry->dccphrx_node); + kmem_cache_free(hist->dccprxh_slab, entry); + } +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_purge); + +struct dccp_rx_hist_entry * + dccp_rx_hist_find_data_packet(const struct list_head *list) +{ + struct dccp_rx_hist_entry *entry, *packet = NULL; + + list_for_each_entry(entry, list, dccphrx_node) + if (entry->dccphrx_type == DCCP_PKT_DATA || + entry->dccphrx_type == DCCP_PKT_DATAACK) { + packet = entry; + break; + } + + return packet; +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet); + +struct dccp_tx_hist *dccp_tx_hist_new(const char *name) +{ + struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); + static const char dccp_tx_hist_mask[] = "tx_hist_%s"; + char *slab_name; + + if (hist == NULL) + goto out; + + slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1, + GFP_ATOMIC); + if (slab_name == NULL) + goto out_free_hist; + + sprintf(slab_name, dccp_tx_hist_mask, name); + hist->dccptxh_slab = kmem_cache_create(slab_name, + sizeof(struct dccp_tx_hist_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (hist->dccptxh_slab == NULL) + goto out_free_slab_name; +out: + return hist; +out_free_slab_name: + kfree(slab_name); +out_free_hist: + kfree(hist); + hist = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_new); + +void dccp_tx_hist_delete(struct dccp_tx_hist *hist) +{ + const char* name = kmem_cache_name(hist->dccptxh_slab); + + kmem_cache_destroy(hist->dccptxh_slab); + kfree(name); + kfree(hist); +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_delete); + +struct dccp_tx_hist_entry * + dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq) +{ + struct dccp_tx_hist_entry *packet = NULL, *entry; + + list_for_each_entry(entry, list, dccphtx_node) + if (entry->dccphtx_seqno == seq) { + packet = entry; + break; + } + + return packet; +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry); + +void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, + struct list_head *list, + struct dccp_tx_hist_entry *packet) +{ + struct dccp_tx_hist_entry *next; + + list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) { + list_del_init(&packet->dccphtx_node); + dccp_tx_hist_entry_delete(hist, packet); + } +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older); + +void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list) +{ + struct dccp_tx_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, list, dccphtx_node) { + list_del_init(&entry->dccphtx_node); + dccp_tx_hist_entry_delete(hist, entry); + } +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_purge); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h new file mode 100644 index 0000000..235828d --- /dev/null +++ b/net/dccp/ccids/lib/packet_history.h @@ -0,0 +1,185 @@ +/* + * net/dccp/packet_history.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo . + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DCCP_PKT_HIST_ +#define _DCCP_PKT_HIST_ + +#include +#include +#include +#include + +#include "../../dccp.h" + +struct dccp_tx_hist_entry { + struct list_head dccphtx_node; + u64 dccphtx_seqno:48, + dccphtx_ccval:4, + dccphtx_sent:1; + u32 dccphtx_rtt; + struct timeval dccphtx_tstamp; +}; + +struct dccp_rx_hist_entry { + struct list_head dccphrx_node; + u64 dccphrx_seqno:48, + dccphrx_ccval:4, + dccphrx_type:4; + u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */ + struct timeval dccphrx_tstamp; +}; + +struct dccp_tx_hist { + kmem_cache_t *dccptxh_slab; +}; + +extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name); +extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist); + +struct dccp_rx_hist { + kmem_cache_t *dccprxh_slab; +}; + +extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name); +extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist); +extern struct dccp_rx_hist_entry * + dccp_rx_hist_find_data_packet(const struct list_head *list); + +static inline struct dccp_tx_hist_entry * + dccp_tx_hist_entry_new(struct dccp_tx_hist *hist, + const unsigned int __nocast prio) +{ + struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab, + prio); + + if (entry != NULL) + entry->dccphtx_sent = 0; + + return entry; +} + +static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist, + struct dccp_tx_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(hist->dccptxh_slab, entry); +} + +extern struct dccp_tx_hist_entry * + dccp_tx_hist_find_entry(const struct list_head *list, + const u64 seq); + +static inline void dccp_tx_hist_add_entry(struct list_head *list, + struct dccp_tx_hist_entry *entry) +{ + list_add(&entry->dccphtx_node, list); +} + +extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, + struct list_head *list, + struct dccp_tx_hist_entry *next); + +extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist, + struct list_head *list); + +static inline struct dccp_tx_hist_entry * + dccp_tx_hist_head(struct list_head *list) +{ + struct dccp_tx_hist_entry *head = NULL; + + if (!list_empty(list)) + head = list_entry(list->next, struct dccp_tx_hist_entry, + dccphtx_node); + return head; +} + +static inline struct dccp_rx_hist_entry * + dccp_rx_hist_entry_new(struct dccp_rx_hist *hist, + const u32 ndp, + const struct sk_buff *skb, + const unsigned int __nocast prio) +{ + struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab, + prio); + + if (entry != NULL) { + const struct dccp_hdr *dh = dccp_hdr(skb); + + entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + entry->dccphrx_ccval = dh->dccph_ccval; + entry->dccphrx_type = dh->dccph_type; + entry->dccphrx_ndp = ndp; + do_gettimeofday(&(entry->dccphrx_tstamp)); + } + + return entry; +} + +static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist, + struct dccp_rx_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(hist->dccprxh_slab, entry); +} + +extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist, + struct list_head *list); + +static inline void dccp_rx_hist_add_entry(struct list_head *list, + struct dccp_rx_hist_entry *entry) +{ + list_add(&entry->dccphrx_node, list); +} + +static inline struct dccp_rx_hist_entry * + dccp_rx_hist_head(struct list_head *list) +{ + struct dccp_rx_hist_entry *head = NULL; + + if (!list_empty(list)) + head = list_entry(list->next, struct dccp_rx_hist_entry, + dccphrx_node); + return head; +} + +static inline int + dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry) +{ + return entry->dccphrx_type == DCCP_PKT_DATA || + entry->dccphrx_type == DCCP_PKT_DATAACK; +} + +#endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/packet_history.c b/net/dccp/packet_history.c deleted file mode 100644 index 2d9ef5a..0000000 --- a/net/dccp/packet_history.c +++ /dev/null @@ -1,199 +0,0 @@ -/* - * net/dccp/packet_history.h - * - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see http://www.wand.net.nz/ - * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include - -#include "packet_history.h" - -struct dccp_rx_hist *dccp_rx_hist_new(const char *name) -{ - struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); - static const char dccp_rx_hist_mask[] = "rx_hist_%s"; - char *slab_name; - - if (hist == NULL) - goto out; - - slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1, - GFP_ATOMIC); - if (slab_name == NULL) - goto out_free_hist; - - sprintf(slab_name, dccp_rx_hist_mask, name); - hist->dccprxh_slab = kmem_cache_create(slab_name, - sizeof(struct dccp_rx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (hist->dccprxh_slab == NULL) - goto out_free_slab_name; -out: - return hist; -out_free_slab_name: - kfree(slab_name); -out_free_hist: - kfree(hist); - hist = NULL; - goto out; -} - -EXPORT_SYMBOL_GPL(dccp_rx_hist_new); - -void dccp_rx_hist_delete(struct dccp_rx_hist *hist) -{ - const char* name = kmem_cache_name(hist->dccprxh_slab); - - kmem_cache_destroy(hist->dccprxh_slab); - kfree(name); - kfree(hist); -} - -EXPORT_SYMBOL_GPL(dccp_rx_hist_delete); - -void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list) -{ - struct dccp_rx_hist_entry *entry, *next; - - list_for_each_entry_safe(entry, next, list, dccphrx_node) { - list_del_init(&entry->dccphrx_node); - kmem_cache_free(hist->dccprxh_slab, entry); - } -} - -EXPORT_SYMBOL_GPL(dccp_rx_hist_purge); - -struct dccp_rx_hist_entry * - dccp_rx_hist_find_data_packet(const struct list_head *list) -{ - struct dccp_rx_hist_entry *entry, *packet = NULL; - - list_for_each_entry(entry, list, dccphrx_node) - if (entry->dccphrx_type == DCCP_PKT_DATA || - entry->dccphrx_type == DCCP_PKT_DATAACK) { - packet = entry; - break; - } - - return packet; -} - -EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet); - -struct dccp_tx_hist *dccp_tx_hist_new(const char *name) -{ - struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); - static const char dccp_tx_hist_mask[] = "tx_hist_%s"; - char *slab_name; - - if (hist == NULL) - goto out; - - slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1, - GFP_ATOMIC); - if (slab_name == NULL) - goto out_free_hist; - - sprintf(slab_name, dccp_tx_hist_mask, name); - hist->dccptxh_slab = kmem_cache_create(slab_name, - sizeof(struct dccp_tx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (hist->dccptxh_slab == NULL) - goto out_free_slab_name; -out: - return hist; -out_free_slab_name: - kfree(slab_name); -out_free_hist: - kfree(hist); - hist = NULL; - goto out; -} - -EXPORT_SYMBOL_GPL(dccp_tx_hist_new); - -void dccp_tx_hist_delete(struct dccp_tx_hist *hist) -{ - const char* name = kmem_cache_name(hist->dccptxh_slab); - - kmem_cache_destroy(hist->dccptxh_slab); - kfree(name); - kfree(hist); -} - -EXPORT_SYMBOL_GPL(dccp_tx_hist_delete); - -struct dccp_tx_hist_entry * - dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq) -{ - struct dccp_tx_hist_entry *packet = NULL, *entry; - - list_for_each_entry(entry, list, dccphtx_node) - if (entry->dccphtx_seqno == seq) { - packet = entry; - break; - } - - return packet; -} - -EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry); - -void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, - struct list_head *list, - struct dccp_tx_hist_entry *packet) -{ - struct dccp_tx_hist_entry *next; - - list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) { - list_del_init(&packet->dccphtx_node); - dccp_tx_hist_entry_delete(hist, packet); - } -} - -EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older); - -void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list) -{ - struct dccp_tx_hist_entry *entry, *next; - - list_for_each_entry_safe(entry, next, list, dccphtx_node) { - list_del_init(&entry->dccphtx_node); - dccp_tx_hist_entry_delete(hist, entry); - } -} - -EXPORT_SYMBOL_GPL(dccp_tx_hist_purge); diff --git a/net/dccp/packet_history.h b/net/dccp/packet_history.h deleted file mode 100644 index 2e5ba34..0000000 --- a/net/dccp/packet_history.h +++ /dev/null @@ -1,185 +0,0 @@ -/* - * net/dccp/packet_history.h - * - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see http://www.wand.net.nz/ - * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#ifndef _DCCP_PKT_HIST_ -#define _DCCP_PKT_HIST_ - -#include -#include -#include -#include - -#include "dccp.h" - -struct dccp_tx_hist_entry { - struct list_head dccphtx_node; - u64 dccphtx_seqno:48, - dccphtx_ccval:4, - dccphtx_sent:1; - u32 dccphtx_rtt; - struct timeval dccphtx_tstamp; -}; - -struct dccp_rx_hist_entry { - struct list_head dccphrx_node; - u64 dccphrx_seqno:48, - dccphrx_ccval:4, - dccphrx_type:4; - u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */ - struct timeval dccphrx_tstamp; -}; - -struct dccp_tx_hist { - kmem_cache_t *dccptxh_slab; -}; - -extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name); -extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist); - -struct dccp_rx_hist { - kmem_cache_t *dccprxh_slab; -}; - -extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name); -extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist); -extern struct dccp_rx_hist_entry * - dccp_rx_hist_find_data_packet(const struct list_head *list); - -static inline struct dccp_tx_hist_entry * - dccp_tx_hist_entry_new(struct dccp_tx_hist *hist, - const unsigned int __nocast prio) -{ - struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab, - prio); - - if (entry != NULL) - entry->dccphtx_sent = 0; - - return entry; -} - -static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist, - struct dccp_tx_hist_entry *entry) -{ - if (entry != NULL) - kmem_cache_free(hist->dccptxh_slab, entry); -} - -extern struct dccp_tx_hist_entry * - dccp_tx_hist_find_entry(const struct list_head *list, - const u64 seq); - -static inline void dccp_tx_hist_add_entry(struct list_head *list, - struct dccp_tx_hist_entry *entry) -{ - list_add(&entry->dccphtx_node, list); -} - -extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, - struct list_head *list, - struct dccp_tx_hist_entry *next); - -extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist, - struct list_head *list); - -static inline struct dccp_tx_hist_entry * - dccp_tx_hist_head(struct list_head *list) -{ - struct dccp_tx_hist_entry *head = NULL; - - if (!list_empty(list)) - head = list_entry(list->next, struct dccp_tx_hist_entry, - dccphtx_node); - return head; -} - -static inline struct dccp_rx_hist_entry * - dccp_rx_hist_entry_new(struct dccp_rx_hist *hist, - const u32 ndp, - const struct sk_buff *skb, - const unsigned int __nocast prio) -{ - struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab, - prio); - - if (entry != NULL) { - const struct dccp_hdr *dh = dccp_hdr(skb); - - entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - entry->dccphrx_ccval = dh->dccph_ccval; - entry->dccphrx_type = dh->dccph_type; - entry->dccphrx_ndp = ndp; - do_gettimeofday(&(entry->dccphrx_tstamp)); - } - - return entry; -} - -static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist, - struct dccp_rx_hist_entry *entry) -{ - if (entry != NULL) - kmem_cache_free(hist->dccprxh_slab, entry); -} - -extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist, - struct list_head *list); - -static inline void dccp_rx_hist_add_entry(struct list_head *list, - struct dccp_rx_hist_entry *entry) -{ - list_add(&entry->dccphrx_node, list); -} - -static inline struct dccp_rx_hist_entry * - dccp_rx_hist_head(struct list_head *list) -{ - struct dccp_rx_hist_entry *head = NULL; - - if (!list_empty(list)) - head = list_entry(list->next, struct dccp_rx_hist_entry, - dccphrx_node); - return head; -} - -static inline int - dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry) -{ - return entry->dccphrx_type == DCCP_PKT_DATA || - entry->dccphrx_type == DCCP_PKT_DATAACK; -} - -#endif /* _DCCP_PKT_HIST_ */ -- cgit v0.10.2 From 5cea0ddce56ff3406a81fbbab80ef45c65701673 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 27 Aug 2005 23:50:46 -0300 Subject: [DCCP]: Introduce dccp_tfrc_lib module with net/dccp/ccids/lib/*.c I'll now take a look at the other proposed TFRC DCCP CCIDs to find more code that is now in ccid3.c and move to this module, the loss event rate, calc_X, etc most probably will be moved there. The main goal of these changes is to pave the way for the implementation of more TFRC based DCCP CCIDs and to shrink ccid3.c, reducing its complexity and helping in getting it rock solid. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 67f9c06..7684d83 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -22,4 +22,8 @@ config IP_DCCP_CCID3 If in doubt, say M. +config IP_DCCP_TFRC_LIB + depends on IP_DCCP_CCID3 + def_tristate IP_DCCP_CCID3 + endmenu diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile index 29eb1b6..956f79f 100644 --- a/net/dccp/ccids/Makefile +++ b/net/dccp/ccids/Makefile @@ -1,3 +1,5 @@ obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o -dccp_ccid3-y := ccid3.o lib/loss_interval.o lib/packet_history.o +dccp_ccid3-y := ccid3.o + +obj-y += lib/ diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile new file mode 100644 index 0000000..e9a91e2 --- /dev/null +++ b/net/dccp/ccids/lib/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o + +dccp_tfrc_lib-y := loss_interval.o packet_history.o diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 2d9ef5a..f252a95 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -35,6 +35,7 @@ */ #include +#include #include #include "packet_history.h" @@ -197,3 +198,8 @@ void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list) } EXPORT_SYMBOL_GPL(dccp_tx_hist_purge); + +MODULE_AUTHOR("Ian McDonald , " + "Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCP TFRC library"); +MODULE_LICENSE("GPL"); -- cgit v0.10.2 From 36729c1a73c354a155db18d64d9e79b86c446fcf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 28 Aug 2005 00:47:15 -0300 Subject: [DCCP]: Move the calc_X routines to dccp_tfrc_lib Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 12548fb..a215c46 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -39,6 +39,7 @@ #include "../dccp.h" #include "lib/packet_history.h" #include "lib/loss_interval.h" +#include "lib/tfrc.h" #include "ccid3.h" /* @@ -112,595 +113,6 @@ static inline void ccid3_hc_tx_set_state(struct sock *sk, hctx->ccid3hctx_state = state; } -#define CALCX_ARRSIZE 500 - -#define CALCX_SPLIT 50000 -/* equivalent to 0.05 */ - -static const u32 calcx_lookup[CALCX_ARRSIZE][2] = { - { 37172 , 8172 }, - { 53499 , 11567 }, - { 66664 , 14180 }, - { 78298 , 16388 }, - { 89021 , 18339 }, - { 99147 , 20108 }, - { 108858 , 21738 }, - { 118273 , 23260 }, - { 127474 , 24693 }, - { 136520 , 26052 }, - { 145456 , 27348 }, - { 154316 , 28589 }, - { 163130 , 29783 }, - { 171919 , 30935 }, - { 180704 , 32049 }, - { 189502 , 33130 }, - { 198328 , 34180 }, - { 207194 , 35202 }, - { 216114 , 36198 }, - { 225097 , 37172 }, - { 234153 , 38123 }, - { 243294 , 39055 }, - { 252527 , 39968 }, - { 261861 , 40864 }, - { 271305 , 41743 }, - { 280866 , 42607 }, - { 290553 , 43457 }, - { 300372 , 44293 }, - { 310333 , 45117 }, - { 320441 , 45929 }, - { 330705 , 46729 }, - { 341131 , 47518 }, - { 351728 , 48297 }, - { 362501 , 49066 }, - { 373460 , 49826 }, - { 384609 , 50577 }, - { 395958 , 51320 }, - { 407513 , 52054 }, - { 419281 , 52780 }, - { 431270 , 53499 }, - { 443487 , 54211 }, - { 455940 , 54916 }, - { 468635 , 55614 }, - { 481581 , 56306 }, - { 494785 , 56991 }, - { 508254 , 57671 }, - { 521996 , 58345 }, - { 536019 , 59014 }, - { 550331 , 59677 }, - { 564939 , 60335 }, - { 579851 , 60988 }, - { 595075 , 61636 }, - { 610619 , 62279 }, - { 626491 , 62918 }, - { 642700 , 63553 }, - { 659253 , 64183 }, - { 676158 , 64809 }, - { 693424 , 65431 }, - { 711060 , 66050 }, - { 729073 , 66664 }, - { 747472 , 67275 }, - { 766266 , 67882 }, - { 785464 , 68486 }, - { 805073 , 69087 }, - { 825103 , 69684 }, - { 845562 , 70278 }, - { 866460 , 70868 }, - { 887805 , 71456 }, - { 909606 , 72041 }, - { 931873 , 72623 }, - { 954614 , 73202 }, - { 977839 , 73778 }, - { 1001557 , 74352 }, - { 1025777 , 74923 }, - { 1050508 , 75492 }, - { 1075761 , 76058 }, - { 1101544 , 76621 }, - { 1127867 , 77183 }, - { 1154739 , 77741 }, - { 1182172 , 78298 }, - { 1210173 , 78852 }, - { 1238753 , 79405 }, - { 1267922 , 79955 }, - { 1297689 , 80503 }, - { 1328066 , 81049 }, - { 1359060 , 81593 }, - { 1390684 , 82135 }, - { 1422947 , 82675 }, - { 1455859 , 83213 }, - { 1489430 , 83750 }, - { 1523671 , 84284 }, - { 1558593 , 84817 }, - { 1594205 , 85348 }, - { 1630518 , 85878 }, - { 1667543 , 86406 }, - { 1705290 , 86932 }, - { 1743770 , 87457 }, - { 1782994 , 87980 }, - { 1822973 , 88501 }, - { 1863717 , 89021 }, - { 1905237 , 89540 }, - { 1947545 , 90057 }, - { 1990650 , 90573 }, - { 2034566 , 91087 }, - { 2079301 , 91600 }, - { 2124869 , 92111 }, - { 2171279 , 92622 }, - { 2218543 , 93131 }, - { 2266673 , 93639 }, - { 2315680 , 94145 }, - { 2365575 , 94650 }, - { 2416371 , 95154 }, - { 2468077 , 95657 }, - { 2520707 , 96159 }, - { 2574271 , 96660 }, - { 2628782 , 97159 }, - { 2684250 , 97658 }, - { 2740689 , 98155 }, - { 2798110 , 98651 }, - { 2856524 , 99147 }, - { 2915944 , 99641 }, - { 2976382 , 100134 }, - { 3037850 , 100626 }, - { 3100360 , 101117 }, - { 3163924 , 101608 }, - { 3228554 , 102097 }, - { 3294263 , 102586 }, - { 3361063 , 103073 }, - { 3428966 , 103560 }, - { 3497984 , 104045 }, - { 3568131 , 104530 }, - { 3639419 , 105014 }, - { 3711860 , 105498 }, - { 3785467 , 105980 }, - { 3860253 , 106462 }, - { 3936229 , 106942 }, - { 4013410 , 107422 }, - { 4091808 , 107902 }, - { 4171435 , 108380 }, - { 4252306 , 108858 }, - { 4334431 , 109335 }, - { 4417825 , 109811 }, - { 4502501 , 110287 }, - { 4588472 , 110762 }, - { 4675750 , 111236 }, - { 4764349 , 111709 }, - { 4854283 , 112182 }, - { 4945564 , 112654 }, - { 5038206 , 113126 }, - { 5132223 , 113597 }, - { 5227627 , 114067 }, - { 5324432 , 114537 }, - { 5422652 , 115006 }, - { 5522299 , 115474 }, - { 5623389 , 115942 }, - { 5725934 , 116409 }, - { 5829948 , 116876 }, - { 5935446 , 117342 }, - { 6042439 , 117808 }, - { 6150943 , 118273 }, - { 6260972 , 118738 }, - { 6372538 , 119202 }, - { 6485657 , 119665 }, - { 6600342 , 120128 }, - { 6716607 , 120591 }, - { 6834467 , 121053 }, - { 6953935 , 121514 }, - { 7075025 , 121976 }, - { 7197752 , 122436 }, - { 7322131 , 122896 }, - { 7448175 , 123356 }, - { 7575898 , 123815 }, - { 7705316 , 124274 }, - { 7836442 , 124733 }, - { 7969291 , 125191 }, - { 8103877 , 125648 }, - { 8240216 , 126105 }, - { 8378321 , 126562 }, - { 8518208 , 127018 }, - { 8659890 , 127474 }, - { 8803384 , 127930 }, - { 8948702 , 128385 }, - { 9095861 , 128840 }, - { 9244875 , 129294 }, - { 9395760 , 129748 }, - { 9548529 , 130202 }, - { 9703198 , 130655 }, - { 9859782 , 131108 }, - { 10018296 , 131561 }, - { 10178755 , 132014 }, - { 10341174 , 132466 }, - { 10505569 , 132917 }, - { 10671954 , 133369 }, - { 10840345 , 133820 }, - { 11010757 , 134271 }, - { 11183206 , 134721 }, - { 11357706 , 135171 }, - { 11534274 , 135621 }, - { 11712924 , 136071 }, - { 11893673 , 136520 }, - { 12076536 , 136969 }, - { 12261527 , 137418 }, - { 12448664 , 137867 }, - { 12637961 , 138315 }, - { 12829435 , 138763 }, - { 13023101 , 139211 }, - { 13218974 , 139658 }, - { 13417071 , 140106 }, - { 13617407 , 140553 }, - { 13819999 , 140999 }, - { 14024862 , 141446 }, - { 14232012 , 141892 }, - { 14441465 , 142339 }, - { 14653238 , 142785 }, - { 14867346 , 143230 }, - { 15083805 , 143676 }, - { 15302632 , 144121 }, - { 15523842 , 144566 }, - { 15747453 , 145011 }, - { 15973479 , 145456 }, - { 16201939 , 145900 }, - { 16432847 , 146345 }, - { 16666221 , 146789 }, - { 16902076 , 147233 }, - { 17140429 , 147677 }, - { 17381297 , 148121 }, - { 17624696 , 148564 }, - { 17870643 , 149007 }, - { 18119154 , 149451 }, - { 18370247 , 149894 }, - { 18623936 , 150336 }, - { 18880241 , 150779 }, - { 19139176 , 151222 }, - { 19400759 , 151664 }, - { 19665007 , 152107 }, - { 19931936 , 152549 }, - { 20201564 , 152991 }, - { 20473907 , 153433 }, - { 20748982 , 153875 }, - { 21026807 , 154316 }, - { 21307399 , 154758 }, - { 21590773 , 155199 }, - { 21876949 , 155641 }, - { 22165941 , 156082 }, - { 22457769 , 156523 }, - { 22752449 , 156964 }, - { 23049999 , 157405 }, - { 23350435 , 157846 }, - { 23653774 , 158287 }, - { 23960036 , 158727 }, - { 24269236 , 159168 }, - { 24581392 , 159608 }, - { 24896521 , 160049 }, - { 25214642 , 160489 }, - { 25535772 , 160929 }, - { 25859927 , 161370 }, - { 26187127 , 161810 }, - { 26517388 , 162250 }, - { 26850728 , 162690 }, - { 27187165 , 163130 }, - { 27526716 , 163569 }, - { 27869400 , 164009 }, - { 28215234 , 164449 }, - { 28564236 , 164889 }, - { 28916423 , 165328 }, - { 29271815 , 165768 }, - { 29630428 , 166208 }, - { 29992281 , 166647 }, - { 30357392 , 167087 }, - { 30725779 , 167526 }, - { 31097459 , 167965 }, - { 31472452 , 168405 }, - { 31850774 , 168844 }, - { 32232445 , 169283 }, - { 32617482 , 169723 }, - { 33005904 , 170162 }, - { 33397730 , 170601 }, - { 33792976 , 171041 }, - { 34191663 , 171480 }, - { 34593807 , 171919 }, - { 34999428 , 172358 }, - { 35408544 , 172797 }, - { 35821174 , 173237 }, - { 36237335 , 173676 }, - { 36657047 , 174115 }, - { 37080329 , 174554 }, - { 37507197 , 174993 }, - { 37937673 , 175433 }, - { 38371773 , 175872 }, - { 38809517 , 176311 }, - { 39250924 , 176750 }, - { 39696012 , 177190 }, - { 40144800 , 177629 }, - { 40597308 , 178068 }, - { 41053553 , 178507 }, - { 41513554 , 178947 }, - { 41977332 , 179386 }, - { 42444904 , 179825 }, - { 42916290 , 180265 }, - { 43391509 , 180704 }, - { 43870579 , 181144 }, - { 44353520 , 181583 }, - { 44840352 , 182023 }, - { 45331092 , 182462 }, - { 45825761 , 182902 }, - { 46324378 , 183342 }, - { 46826961 , 183781 }, - { 47333531 , 184221 }, - { 47844106 , 184661 }, - { 48358706 , 185101 }, - { 48877350 , 185541 }, - { 49400058 , 185981 }, - { 49926849 , 186421 }, - { 50457743 , 186861 }, - { 50992759 , 187301 }, - { 51531916 , 187741 }, - { 52075235 , 188181 }, - { 52622735 , 188622 }, - { 53174435 , 189062 }, - { 53730355 , 189502 }, - { 54290515 , 189943 }, - { 54854935 , 190383 }, - { 55423634 , 190824 }, - { 55996633 , 191265 }, - { 56573950 , 191706 }, - { 57155606 , 192146 }, - { 57741621 , 192587 }, - { 58332014 , 193028 }, - { 58926806 , 193470 }, - { 59526017 , 193911 }, - { 60129666 , 194352 }, - { 60737774 , 194793 }, - { 61350361 , 195235 }, - { 61967446 , 195677 }, - { 62589050 , 196118 }, - { 63215194 , 196560 }, - { 63845897 , 197002 }, - { 64481179 , 197444 }, - { 65121061 , 197886 }, - { 65765563 , 198328 }, - { 66414705 , 198770 }, - { 67068508 , 199213 }, - { 67726992 , 199655 }, - { 68390177 , 200098 }, - { 69058085 , 200540 }, - { 69730735 , 200983 }, - { 70408147 , 201426 }, - { 71090343 , 201869 }, - { 71777343 , 202312 }, - { 72469168 , 202755 }, - { 73165837 , 203199 }, - { 73867373 , 203642 }, - { 74573795 , 204086 }, - { 75285124 , 204529 }, - { 76001380 , 204973 }, - { 76722586 , 205417 }, - { 77448761 , 205861 }, - { 78179926 , 206306 }, - { 78916102 , 206750 }, - { 79657310 , 207194 }, - { 80403571 , 207639 }, - { 81154906 , 208084 }, - { 81911335 , 208529 }, - { 82672880 , 208974 }, - { 83439562 , 209419 }, - { 84211402 , 209864 }, - { 84988421 , 210309 }, - { 85770640 , 210755 }, - { 86558080 , 211201 }, - { 87350762 , 211647 }, - { 88148708 , 212093 }, - { 88951938 , 212539 }, - { 89760475 , 212985 }, - { 90574339 , 213432 }, - { 91393551 , 213878 }, - { 92218133 , 214325 }, - { 93048107 , 214772 }, - { 93883493 , 215219 }, - { 94724314 , 215666 }, - { 95570590 , 216114 }, - { 96422343 , 216561 }, - { 97279594 , 217009 }, - { 98142366 , 217457 }, - { 99010679 , 217905 }, - { 99884556 , 218353 }, - { 100764018 , 218801 }, - { 101649086 , 219250 }, - { 102539782 , 219698 }, - { 103436128 , 220147 }, - { 104338146 , 220596 }, - { 105245857 , 221046 }, - { 106159284 , 221495 }, - { 107078448 , 221945 }, - { 108003370 , 222394 }, - { 108934074 , 222844 }, - { 109870580 , 223294 }, - { 110812910 , 223745 }, - { 111761087 , 224195 }, - { 112715133 , 224646 }, - { 113675069 , 225097 }, - { 114640918 , 225548 }, - { 115612702 , 225999 }, - { 116590442 , 226450 }, - { 117574162 , 226902 }, - { 118563882 , 227353 }, - { 119559626 , 227805 }, - { 120561415 , 228258 }, - { 121569272 , 228710 }, - { 122583219 , 229162 }, - { 123603278 , 229615 }, - { 124629471 , 230068 }, - { 125661822 , 230521 }, - { 126700352 , 230974 }, - { 127745083 , 231428 }, - { 128796039 , 231882 }, - { 129853241 , 232336 }, - { 130916713 , 232790 }, - { 131986475 , 233244 }, - { 133062553 , 233699 }, - { 134144966 , 234153 }, - { 135233739 , 234608 }, - { 136328894 , 235064 }, - { 137430453 , 235519 }, - { 138538440 , 235975 }, - { 139652876 , 236430 }, - { 140773786 , 236886 }, - { 141901190 , 237343 }, - { 143035113 , 237799 }, - { 144175576 , 238256 }, - { 145322604 , 238713 }, - { 146476218 , 239170 }, - { 147636442 , 239627 }, - { 148803298 , 240085 }, - { 149976809 , 240542 }, - { 151156999 , 241000 }, - { 152343890 , 241459 }, - { 153537506 , 241917 }, - { 154737869 , 242376 }, - { 155945002 , 242835 }, - { 157158929 , 243294 }, - { 158379673 , 243753 }, - { 159607257 , 244213 }, - { 160841704 , 244673 }, - { 162083037 , 245133 }, - { 163331279 , 245593 }, - { 164586455 , 246054 }, - { 165848586 , 246514 }, - { 167117696 , 246975 }, - { 168393810 , 247437 }, - { 169676949 , 247898 }, - { 170967138 , 248360 }, - { 172264399 , 248822 }, - { 173568757 , 249284 }, - { 174880235 , 249747 }, - { 176198856 , 250209 }, - { 177524643 , 250672 }, - { 178857621 , 251136 }, - { 180197813 , 251599 }, - { 181545242 , 252063 }, - { 182899933 , 252527 }, - { 184261908 , 252991 }, - { 185631191 , 253456 }, - { 187007807 , 253920 }, - { 188391778 , 254385 }, - { 189783129 , 254851 }, - { 191181884 , 255316 }, - { 192588065 , 255782 }, - { 194001698 , 256248 }, - { 195422805 , 256714 }, - { 196851411 , 257181 }, - { 198287540 , 257648 }, - { 199731215 , 258115 }, - { 201182461 , 258582 }, - { 202641302 , 259050 }, - { 204107760 , 259518 }, - { 205581862 , 259986 }, - { 207063630 , 260454 }, - { 208553088 , 260923 }, - { 210050262 , 261392 }, - { 211555174 , 261861 }, - { 213067849 , 262331 }, - { 214588312 , 262800 }, - { 216116586 , 263270 }, - { 217652696 , 263741 }, - { 219196666 , 264211 }, - { 220748520 , 264682 }, - { 222308282 , 265153 }, - { 223875978 , 265625 }, - { 225451630 , 266097 }, - { 227035265 , 266569 }, - { 228626905 , 267041 }, - { 230226576 , 267514 }, - { 231834302 , 267986 }, - { 233450107 , 268460 }, - { 235074016 , 268933 }, - { 236706054 , 269407 }, - { 238346244 , 269881 }, - { 239994613 , 270355 }, - { 241651183 , 270830 }, - { 243315981 , 271305 } -}; - -/* Calculate the send rate as per section 3.1 of RFC3448 - -Returns send rate in bytes per second - -Integer maths and lookups are used as not allowed floating point in kernel - -The function for Xcalc as per section 3.1 of RFC3448 is: - -X = s - ------------------------------------------------------------- - R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2))) - -where -X is the trasmit rate in bytes/second -s is the packet size in bytes -R is the round trip time in seconds -p is the loss event rate, between 0 and 1.0, of the number of loss events - as a fraction of the number of packets transmitted -t_RTO is the TCP retransmission timeout value in seconds -b is the number of packets acknowledged by a single TCP acknowledgement - -we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes: - -X = s - ----------------------------------------------------------------------- - R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2))) - - -which we can break down into: - -X = s - -------- - R * f(p) - -where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p)) - -Function parameters: -s - bytes -R - RTT in usecs -p - loss rate (decimal fraction multiplied by 1,000,000) - -Returns Xcalc in bytes per second - -DON'T alter this code unless you run test cases against it as the code -has been manipulated to stop underflow/overlow. - -*/ -static u32 ccid3_calc_x(u16 s, u32 R, u32 p) -{ - int index; - u32 f; - u64 tmp1, tmp2; - - if (p < CALCX_SPLIT) - index = (p / (CALCX_SPLIT / CALCX_ARRSIZE)) - 1; - else - index = (p / (1000000 / CALCX_ARRSIZE)) - 1; - - if (index < 0) - /* p should be 0 unless there is a bug in my code */ - index = 0; - - if (R == 0) - R = 1; /* RTT can't be zero or else divide by zero */ - - BUG_ON(index >= CALCX_ARRSIZE); - - if (p >= CALCX_SPLIT) - f = calcx_lookup[index][0]; - else - f = calcx_lookup[index][1]; - - tmp1 = ((u64)s * 100000000); - tmp2 = ((u64)R * (u64)f); - do_div(tmp2,10000); - do_div(tmp1,tmp2); - /* don't alter above math unless you test due to overflow on 32 bit */ - - return (u32)tmp1; -} - /* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */ static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx) { @@ -737,9 +149,9 @@ static void ccid3_hc_tx_update_x(struct sock *sk) /* To avoid large error in calcX */ if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) { - hctx->ccid3hctx_x_calc = ccid3_calc_x(hctx->ccid3hctx_s, - hctx->ccid3hctx_rtt, - hctx->ccid3hctx_p); + hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p); hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc, 2 * hctx->ccid3hctx_x_recv), (hctx->ccid3hctx_s / @@ -1533,32 +945,6 @@ static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) &x_recv, sizeof(x_recv)); } -/* - * args: fvalue - function value to match - * returns: p closest to that value - * - * both fvalue and p are multiplied by 1,000,000 to use ints - */ -static u32 calcx_reverse_lookup(u32 fvalue) { - int ctr = 0; - int small; - - if (fvalue < calcx_lookup[0][1]) - return 0; - if (fvalue <= calcx_lookup[CALCX_ARRSIZE-1][1]) - small = 1; - else if (fvalue > calcx_lookup[CALCX_ARRSIZE-1][0]) - return 1000000; - else - small = 0; - while (fvalue > calcx_lookup[ctr][small]) - ctr++; - if (small) - return (CALCX_SPLIT * ctr / CALCX_ARRSIZE); - else - return (1000000 * ctr / CALCX_ARRSIZE) ; -} - /* calculate first loss interval * * returns estimated loss interval in usecs */ @@ -1627,7 +1013,7 @@ found: tmp2 = (u32)tmp1; fval = (hcrx->ccid3hcrx_s * 100000) / tmp2; /* do not alter order above or you will get overflow on 32 bit */ - p = calcx_reverse_lookup(fval); + p = tfrc_calc_x_reverse_lookup(fval); ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied " "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile index e9a91e2..5f940a6 100644 --- a/net/dccp/ccids/lib/Makefile +++ b/net/dccp/ccids/lib/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o -dccp_tfrc_lib-y := loss_interval.o packet_history.o +dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h new file mode 100644 index 0000000..130c4c4 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc.h @@ -0,0 +1,22 @@ +#ifndef _TFRC_H_ +#define _TFRC_H_ +/* + * net/dccp/ccids/lib/tfrc.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + +extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); +extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); + +#endif /* _TFRC_H_ */ diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c new file mode 100644 index 0000000..d2b5933 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -0,0 +1,644 @@ +/* + * net/dccp/ccids/lib/tfrc_equation.c + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include + +#include +#include + +#include "tfrc.h" + +#define TFRC_CALC_X_ARRSIZE 500 + +#define TFRC_CALC_X_SPLIT 50000 +/* equivalent to 0.05 */ + +static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { + { 37172, 8172 }, + { 53499, 11567 }, + { 66664, 14180 }, + { 78298, 16388 }, + { 89021, 18339 }, + { 99147, 20108 }, + { 108858, 21738 }, + { 118273, 23260 }, + { 127474, 24693 }, + { 136520, 26052 }, + { 145456, 27348 }, + { 154316, 28589 }, + { 163130, 29783 }, + { 171919, 30935 }, + { 180704, 32049 }, + { 189502, 33130 }, + { 198328, 34180 }, + { 207194, 35202 }, + { 216114, 36198 }, + { 225097, 37172 }, + { 234153, 38123 }, + { 243294, 39055 }, + { 252527, 39968 }, + { 261861, 40864 }, + { 271305, 41743 }, + { 280866, 42607 }, + { 290553, 43457 }, + { 300372, 44293 }, + { 310333, 45117 }, + { 320441, 45929 }, + { 330705, 46729 }, + { 341131, 47518 }, + { 351728, 48297 }, + { 362501, 49066 }, + { 373460, 49826 }, + { 384609, 50577 }, + { 395958, 51320 }, + { 407513, 52054 }, + { 419281, 52780 }, + { 431270, 53499 }, + { 443487, 54211 }, + { 455940, 54916 }, + { 468635, 55614 }, + { 481581, 56306 }, + { 494785, 56991 }, + { 508254, 57671 }, + { 521996, 58345 }, + { 536019, 59014 }, + { 550331, 59677 }, + { 564939, 60335 }, + { 579851, 60988 }, + { 595075, 61636 }, + { 610619, 62279 }, + { 626491, 62918 }, + { 642700, 63553 }, + { 659253, 64183 }, + { 676158, 64809 }, + { 693424, 65431 }, + { 711060, 66050 }, + { 729073, 66664 }, + { 747472, 67275 }, + { 766266, 67882 }, + { 785464, 68486 }, + { 805073, 69087 }, + { 825103, 69684 }, + { 845562, 70278 }, + { 866460, 70868 }, + { 887805, 71456 }, + { 909606, 72041 }, + { 931873, 72623 }, + { 954614, 73202 }, + { 977839, 73778 }, + { 1001557, 74352 }, + { 1025777, 74923 }, + { 1050508, 75492 }, + { 1075761, 76058 }, + { 1101544, 76621 }, + { 1127867, 77183 }, + { 1154739, 77741 }, + { 1182172, 78298 }, + { 1210173, 78852 }, + { 1238753, 79405 }, + { 1267922, 79955 }, + { 1297689, 80503 }, + { 1328066, 81049 }, + { 1359060, 81593 }, + { 1390684, 82135 }, + { 1422947, 82675 }, + { 1455859, 83213 }, + { 1489430, 83750 }, + { 1523671, 84284 }, + { 1558593, 84817 }, + { 1594205, 85348 }, + { 1630518, 85878 }, + { 1667543, 86406 }, + { 1705290, 86932 }, + { 1743770, 87457 }, + { 1782994, 87980 }, + { 1822973, 88501 }, + { 1863717, 89021 }, + { 1905237, 89540 }, + { 1947545, 90057 }, + { 1990650, 90573 }, + { 2034566, 91087 }, + { 2079301, 91600 }, + { 2124869, 92111 }, + { 2171279, 92622 }, + { 2218543, 93131 }, + { 2266673, 93639 }, + { 2315680, 94145 }, + { 2365575, 94650 }, + { 2416371, 95154 }, + { 2468077, 95657 }, + { 2520707, 96159 }, + { 2574271, 96660 }, + { 2628782, 97159 }, + { 2684250, 97658 }, + { 2740689, 98155 }, + { 2798110, 98651 }, + { 2856524, 99147 }, + { 2915944, 99641 }, + { 2976382, 100134 }, + { 3037850, 100626 }, + { 3100360, 101117 }, + { 3163924, 101608 }, + { 3228554, 102097 }, + { 3294263, 102586 }, + { 3361063, 103073 }, + { 3428966, 103560 }, + { 3497984, 104045 }, + { 3568131, 104530 }, + { 3639419, 105014 }, + { 3711860, 105498 }, + { 3785467, 105980 }, + { 3860253, 106462 }, + { 3936229, 106942 }, + { 4013410, 107422 }, + { 4091808, 107902 }, + { 4171435, 108380 }, + { 4252306, 108858 }, + { 4334431, 109335 }, + { 4417825, 109811 }, + { 4502501, 110287 }, + { 4588472, 110762 }, + { 4675750, 111236 }, + { 4764349, 111709 }, + { 4854283, 112182 }, + { 4945564, 112654 }, + { 5038206, 113126 }, + { 5132223, 113597 }, + { 5227627, 114067 }, + { 5324432, 114537 }, + { 5422652, 115006 }, + { 5522299, 115474 }, + { 5623389, 115942 }, + { 5725934, 116409 }, + { 5829948, 116876 }, + { 5935446, 117342 }, + { 6042439, 117808 }, + { 6150943, 118273 }, + { 6260972, 118738 }, + { 6372538, 119202 }, + { 6485657, 119665 }, + { 6600342, 120128 }, + { 6716607, 120591 }, + { 6834467, 121053 }, + { 6953935, 121514 }, + { 7075025, 121976 }, + { 7197752, 122436 }, + { 7322131, 122896 }, + { 7448175, 123356 }, + { 7575898, 123815 }, + { 7705316, 124274 }, + { 7836442, 124733 }, + { 7969291, 125191 }, + { 8103877, 125648 }, + { 8240216, 126105 }, + { 8378321, 126562 }, + { 8518208, 127018 }, + { 8659890, 127474 }, + { 8803384, 127930 }, + { 8948702, 128385 }, + { 9095861, 128840 }, + { 9244875, 129294 }, + { 9395760, 129748 }, + { 9548529, 130202 }, + { 9703198, 130655 }, + { 9859782, 131108 }, + { 10018296, 131561 }, + { 10178755, 132014 }, + { 10341174, 132466 }, + { 10505569, 132917 }, + { 10671954, 133369 }, + { 10840345, 133820 }, + { 11010757, 134271 }, + { 11183206, 134721 }, + { 11357706, 135171 }, + { 11534274, 135621 }, + { 11712924, 136071 }, + { 11893673, 136520 }, + { 12076536, 136969 }, + { 12261527, 137418 }, + { 12448664, 137867 }, + { 12637961, 138315 }, + { 12829435, 138763 }, + { 13023101, 139211 }, + { 13218974, 139658 }, + { 13417071, 140106 }, + { 13617407, 140553 }, + { 13819999, 140999 }, + { 14024862, 141446 }, + { 14232012, 141892 }, + { 14441465, 142339 }, + { 14653238, 142785 }, + { 14867346, 143230 }, + { 15083805, 143676 }, + { 15302632, 144121 }, + { 15523842, 144566 }, + { 15747453, 145011 }, + { 15973479, 145456 }, + { 16201939, 145900 }, + { 16432847, 146345 }, + { 16666221, 146789 }, + { 16902076, 147233 }, + { 17140429, 147677 }, + { 17381297, 148121 }, + { 17624696, 148564 }, + { 17870643, 149007 }, + { 18119154, 149451 }, + { 18370247, 149894 }, + { 18623936, 150336 }, + { 18880241, 150779 }, + { 19139176, 151222 }, + { 19400759, 151664 }, + { 19665007, 152107 }, + { 19931936, 152549 }, + { 20201564, 152991 }, + { 20473907, 153433 }, + { 20748982, 153875 }, + { 21026807, 154316 }, + { 21307399, 154758 }, + { 21590773, 155199 }, + { 21876949, 155641 }, + { 22165941, 156082 }, + { 22457769, 156523 }, + { 22752449, 156964 }, + { 23049999, 157405 }, + { 23350435, 157846 }, + { 23653774, 158287 }, + { 23960036, 158727 }, + { 24269236, 159168 }, + { 24581392, 159608 }, + { 24896521, 160049 }, + { 25214642, 160489 }, + { 25535772, 160929 }, + { 25859927, 161370 }, + { 26187127, 161810 }, + { 26517388, 162250 }, + { 26850728, 162690 }, + { 27187165, 163130 }, + { 27526716, 163569 }, + { 27869400, 164009 }, + { 28215234, 164449 }, + { 28564236, 164889 }, + { 28916423, 165328 }, + { 29271815, 165768 }, + { 29630428, 166208 }, + { 29992281, 166647 }, + { 30357392, 167087 }, + { 30725779, 167526 }, + { 31097459, 167965 }, + { 31472452, 168405 }, + { 31850774, 168844 }, + { 32232445, 169283 }, + { 32617482, 169723 }, + { 33005904, 170162 }, + { 33397730, 170601 }, + { 33792976, 171041 }, + { 34191663, 171480 }, + { 34593807, 171919 }, + { 34999428, 172358 }, + { 35408544, 172797 }, + { 35821174, 173237 }, + { 36237335, 173676 }, + { 36657047, 174115 }, + { 37080329, 174554 }, + { 37507197, 174993 }, + { 37937673, 175433 }, + { 38371773, 175872 }, + { 38809517, 176311 }, + { 39250924, 176750 }, + { 39696012, 177190 }, + { 40144800, 177629 }, + { 40597308, 178068 }, + { 41053553, 178507 }, + { 41513554, 178947 }, + { 41977332, 179386 }, + { 42444904, 179825 }, + { 42916290, 180265 }, + { 43391509, 180704 }, + { 43870579, 181144 }, + { 44353520, 181583 }, + { 44840352, 182023 }, + { 45331092, 182462 }, + { 45825761, 182902 }, + { 46324378, 183342 }, + { 46826961, 183781 }, + { 47333531, 184221 }, + { 47844106, 184661 }, + { 48358706, 185101 }, + { 48877350, 185541 }, + { 49400058, 185981 }, + { 49926849, 186421 }, + { 50457743, 186861 }, + { 50992759, 187301 }, + { 51531916, 187741 }, + { 52075235, 188181 }, + { 52622735, 188622 }, + { 53174435, 189062 }, + { 53730355, 189502 }, + { 54290515, 189943 }, + { 54854935, 190383 }, + { 55423634, 190824 }, + { 55996633, 191265 }, + { 56573950, 191706 }, + { 57155606, 192146 }, + { 57741621, 192587 }, + { 58332014, 193028 }, + { 58926806, 193470 }, + { 59526017, 193911 }, + { 60129666, 194352 }, + { 60737774, 194793 }, + { 61350361, 195235 }, + { 61967446, 195677 }, + { 62589050, 196118 }, + { 63215194, 196560 }, + { 63845897, 197002 }, + { 64481179, 197444 }, + { 65121061, 197886 }, + { 65765563, 198328 }, + { 66414705, 198770 }, + { 67068508, 199213 }, + { 67726992, 199655 }, + { 68390177, 200098 }, + { 69058085, 200540 }, + { 69730735, 200983 }, + { 70408147, 201426 }, + { 71090343, 201869 }, + { 71777343, 202312 }, + { 72469168, 202755 }, + { 73165837, 203199 }, + { 73867373, 203642 }, + { 74573795, 204086 }, + { 75285124, 204529 }, + { 76001380, 204973 }, + { 76722586, 205417 }, + { 77448761, 205861 }, + { 78179926, 206306 }, + { 78916102, 206750 }, + { 79657310, 207194 }, + { 80403571, 207639 }, + { 81154906, 208084 }, + { 81911335, 208529 }, + { 82672880, 208974 }, + { 83439562, 209419 }, + { 84211402, 209864 }, + { 84988421, 210309 }, + { 85770640, 210755 }, + { 86558080, 211201 }, + { 87350762, 211647 }, + { 88148708, 212093 }, + { 88951938, 212539 }, + { 89760475, 212985 }, + { 90574339, 213432 }, + { 91393551, 213878 }, + { 92218133, 214325 }, + { 93048107, 214772 }, + { 93883493, 215219 }, + { 94724314, 215666 }, + { 95570590, 216114 }, + { 96422343, 216561 }, + { 97279594, 217009 }, + { 98142366, 217457 }, + { 99010679, 217905 }, + { 99884556, 218353 }, + { 100764018, 218801 }, + { 101649086, 219250 }, + { 102539782, 219698 }, + { 103436128, 220147 }, + { 104338146, 220596 }, + { 105245857, 221046 }, + { 106159284, 221495 }, + { 107078448, 221945 }, + { 108003370, 222394 }, + { 108934074, 222844 }, + { 109870580, 223294 }, + { 110812910, 223745 }, + { 111761087, 224195 }, + { 112715133, 224646 }, + { 113675069, 225097 }, + { 114640918, 225548 }, + { 115612702, 225999 }, + { 116590442, 226450 }, + { 117574162, 226902 }, + { 118563882, 227353 }, + { 119559626, 227805 }, + { 120561415, 228258 }, + { 121569272, 228710 }, + { 122583219, 229162 }, + { 123603278, 229615 }, + { 124629471, 230068 }, + { 125661822, 230521 }, + { 126700352, 230974 }, + { 127745083, 231428 }, + { 128796039, 231882 }, + { 129853241, 232336 }, + { 130916713, 232790 }, + { 131986475, 233244 }, + { 133062553, 233699 }, + { 134144966, 234153 }, + { 135233739, 234608 }, + { 136328894, 235064 }, + { 137430453, 235519 }, + { 138538440, 235975 }, + { 139652876, 236430 }, + { 140773786, 236886 }, + { 141901190, 237343 }, + { 143035113, 237799 }, + { 144175576, 238256 }, + { 145322604, 238713 }, + { 146476218, 239170 }, + { 147636442, 239627 }, + { 148803298, 240085 }, + { 149976809, 240542 }, + { 151156999, 241000 }, + { 152343890, 241459 }, + { 153537506, 241917 }, + { 154737869, 242376 }, + { 155945002, 242835 }, + { 157158929, 243294 }, + { 158379673, 243753 }, + { 159607257, 244213 }, + { 160841704, 244673 }, + { 162083037, 245133 }, + { 163331279, 245593 }, + { 164586455, 246054 }, + { 165848586, 246514 }, + { 167117696, 246975 }, + { 168393810, 247437 }, + { 169676949, 247898 }, + { 170967138, 248360 }, + { 172264399, 248822 }, + { 173568757, 249284 }, + { 174880235, 249747 }, + { 176198856, 250209 }, + { 177524643, 250672 }, + { 178857621, 251136 }, + { 180197813, 251599 }, + { 181545242, 252063 }, + { 182899933, 252527 }, + { 184261908, 252991 }, + { 185631191, 253456 }, + { 187007807, 253920 }, + { 188391778, 254385 }, + { 189783129, 254851 }, + { 191181884, 255316 }, + { 192588065, 255782 }, + { 194001698, 256248 }, + { 195422805, 256714 }, + { 196851411, 257181 }, + { 198287540, 257648 }, + { 199731215, 258115 }, + { 201182461, 258582 }, + { 202641302, 259050 }, + { 204107760, 259518 }, + { 205581862, 259986 }, + { 207063630, 260454 }, + { 208553088, 260923 }, + { 210050262, 261392 }, + { 211555174, 261861 }, + { 213067849, 262331 }, + { 214588312, 262800 }, + { 216116586, 263270 }, + { 217652696, 263741 }, + { 219196666, 264211 }, + { 220748520, 264682 }, + { 222308282, 265153 }, + { 223875978, 265625 }, + { 225451630, 266097 }, + { 227035265, 266569 }, + { 228626905, 267041 }, + { 230226576, 267514 }, + { 231834302, 267986 }, + { 233450107, 268460 }, + { 235074016, 268933 }, + { 236706054, 269407 }, + { 238346244, 269881 }, + { 239994613, 270355 }, + { 241651183, 270830 }, + { 243315981, 271305 } +}; + +/* Calculate the send rate as per section 3.1 of RFC3448 + +Returns send rate in bytes per second + +Integer maths and lookups are used as not allowed floating point in kernel + +The function for Xcalc as per section 3.1 of RFC3448 is: + +X = s + ------------------------------------------------------------- + R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2))) + +where +X is the trasmit rate in bytes/second +s is the packet size in bytes +R is the round trip time in seconds +p is the loss event rate, between 0 and 1.0, of the number of loss events + as a fraction of the number of packets transmitted +t_RTO is the TCP retransmission timeout value in seconds +b is the number of packets acknowledged by a single TCP acknowledgement + +we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes: + +X = s + ----------------------------------------------------------------------- + R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2))) + + +which we can break down into: + +X = s + -------- + R * f(p) + +where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p)) + +Function parameters: +s - bytes +R - RTT in usecs +p - loss rate (decimal fraction multiplied by 1,000,000) + +Returns Xcalc in bytes per second + +DON'T alter this code unless you run test cases against it as the code +has been manipulated to stop underflow/overlow. + +*/ +u32 tfrc_calc_x(u16 s, u32 R, u32 p) +{ + int index; + u32 f; + u64 tmp1, tmp2; + + if (p < TFRC_CALC_X_SPLIT) + index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1; + else + index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1; + + if (index < 0) + /* p should be 0 unless there is a bug in my code */ + index = 0; + + if (R == 0) + R = 1; /* RTT can't be zero or else divide by zero */ + + BUG_ON(index >= TFRC_CALC_X_ARRSIZE); + + if (p >= TFRC_CALC_X_SPLIT) + f = tfrc_calc_x_lookup[index][0]; + else + f = tfrc_calc_x_lookup[index][1]; + + tmp1 = ((u64)s * 100000000); + tmp2 = ((u64)R * (u64)f); + do_div(tmp2, 10000); + do_div(tmp1, tmp2); + /* Don't alter above math unless you test due to overflow on 32 bit */ + + return (u32)tmp1; +} + +EXPORT_SYMBOL_GPL(tfrc_calc_x); + +/* + * args: fvalue - function value to match + * returns: p closest to that value + * + * both fvalue and p are multiplied by 1,000,000 to use ints + */ +u32 tfrc_calc_x_reverse_lookup(u32 fvalue) +{ + int ctr = 0; + int small; + + if (fvalue < tfrc_calc_x_lookup[0][1]) + return 0; + + if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) + small = 1; + else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) + return 1000000; + else + small = 0; + + while (fvalue > tfrc_calc_x_lookup[ctr][small]) + ctr++; + + if (small) + return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE; + else + return 1000000 * ctr / TFRC_CALC_X_ARRSIZE; +} + +EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); -- cgit v0.10.2 From 072ab6c68e3dd158b68d97eaff16734474d2f8f8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 28 Aug 2005 01:19:14 -0300 Subject: [CCID3]: Move ccid3_hc_rx_add_hist to packet_history.c Renaming it to dccp_rx_hist_add_packet. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index a215c46..849f558 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -744,125 +744,6 @@ static inline void ccid3_hc_rx_set_state(struct sock *sk, hcrx->ccid3hcrx_state = state; } -static int ccid3_hc_rx_add_hist(struct sock *sk, - struct dccp_rx_hist_entry *packet) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; - struct dccp_rx_hist_entry *entry, *next, *iter; - u8 num_later = 0; - - iter = dccp_rx_hist_head(&hcrx->ccid3hcrx_hist); - if (iter == NULL) - dccp_rx_hist_add_entry(&hcrx->ccid3hcrx_hist, packet); - else { - const u64 seqno = packet->dccphrx_seqno; - - if (after48(seqno, iter->dccphrx_seqno)) - dccp_rx_hist_add_entry(&hcrx->ccid3hcrx_hist, packet); - else { - if (dccp_rx_hist_entry_data_packet(iter)) - num_later = 1; - - list_for_each_entry_continue(iter, - &hcrx->ccid3hcrx_hist, - dccphrx_node) { - if (after48(seqno, iter->dccphrx_seqno)) { - dccp_rx_hist_add_entry(&iter->dccphrx_node, - packet); - goto trim_history; - } - - if (dccp_rx_hist_entry_data_packet(iter)) - num_later++; - - if (num_later == TFRC_RECV_NUM_LATE_LOSS) { - dccp_rx_hist_entry_delete(ccid3_rx_hist, - packet); - ccid3_pr_debug("%s, sk=%p, packet" - "(%llu) already lost!\n", - dccp_role(sk), sk, - seqno); - return 1; - } - } - - if (num_later < TFRC_RECV_NUM_LATE_LOSS) - dccp_rx_hist_add_entry(&hcrx->ccid3hcrx_hist, - packet); - /* - * FIXME: else what? should we destroy the packet - * like above? - */ - } - } - -trim_history: - /* - * Trim history (remove all packets after the NUM_LATE_LOSS + 1 - * data packets) - */ - num_later = TFRC_RECV_NUM_LATE_LOSS + 1; - - if (!list_empty(&hcrx->ccid3hcrx_li_hist)) { - list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, - dccphrx_node) { - if (num_later == 0) { - list_del_init(&entry->dccphrx_node); - dccp_rx_hist_entry_delete(ccid3_rx_hist, entry); - } else if (dccp_rx_hist_entry_data_packet(entry)) - --num_later; - } - } else { - int step = 0; - u8 win_count = 0; /* Not needed, but lets shut up gcc */ - int tmp; - /* - * We have no loss interval history so we need at least one - * rtt:s of data packets to approximate rtt. - */ - list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, - dccphrx_node) { - if (num_later == 0) { - switch (step) { - case 0: - step = 1; - /* OK, find next data packet */ - num_later = 1; - break; - case 1: - step = 2; - /* OK, find next data packet */ - num_later = 1; - win_count = entry->dccphrx_ccval; - break; - case 2: - tmp = win_count - entry->dccphrx_ccval; - if (tmp < 0) - tmp += TFRC_WIN_COUNT_LIMIT; - if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) { - /* - * We have found a packet older - * than one rtt remove the rest - */ - step = 3; - } else /* OK, find next data packet */ - num_later = 1; - break; - case 3: - list_del_init(&entry->dccphrx_node); - dccp_rx_hist_entry_delete(ccid3_rx_hist, - entry); - break; - } - } else if (dccp_rx_hist_entry_data_packet(entry)) - --num_later; - } - } - - return 0; -} - static void ccid3_hc_rx_send_feedback(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); @@ -1185,7 +1066,8 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) win_count = packet->dccphrx_ccval; - ins = ccid3_hc_rx_add_hist(sk, packet); + ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist, + &hcrx->ccid3hcrx_li_hist, packet); if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK) return; diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index f68d0b4..ee8cbac 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -51,17 +51,11 @@ /* In usecs - half the scheduling granularity as per RFC3448 4.6 */ #define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) -#define TFRC_WIN_COUNT_PER_RTT 4 -#define TFRC_WIN_COUNT_LIMIT 16 - /* In seconds */ #define TFRC_MAX_BACK_OFF_TIME 64 #define TFRC_SMALLEST_P 40 -/* Number of later packets received before one is considered lost */ -#define TFRC_RECV_NUM_LATE_LOSS 3 - enum ccid3_options { TFRC_OPT_LOSS_EVENT_RATE = 192, TFRC_OPT_LOSS_INTERVALS = 193, diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index f252a95..e2576b4 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -113,6 +113,117 @@ struct dccp_rx_hist_entry * EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet); +int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, + struct list_head *rx_list, + struct list_head *li_list, + struct dccp_rx_hist_entry *packet) +{ + struct dccp_rx_hist_entry *entry, *next, *iter; + u8 num_later = 0; + + iter = dccp_rx_hist_head(rx_list); + if (iter == NULL) + dccp_rx_hist_add_entry(rx_list, packet); + else { + const u64 seqno = packet->dccphrx_seqno; + + if (after48(seqno, iter->dccphrx_seqno)) + dccp_rx_hist_add_entry(rx_list, packet); + else { + if (dccp_rx_hist_entry_data_packet(iter)) + num_later = 1; + + list_for_each_entry_continue(iter, rx_list, + dccphrx_node) { + if (after48(seqno, iter->dccphrx_seqno)) { + dccp_rx_hist_add_entry(&iter->dccphrx_node, + packet); + goto trim_history; + } + + if (dccp_rx_hist_entry_data_packet(iter)) + num_later++; + + if (num_later == TFRC_RECV_NUM_LATE_LOSS) { + dccp_rx_hist_entry_delete(hist, packet); + return 1; + } + } + + if (num_later < TFRC_RECV_NUM_LATE_LOSS) + dccp_rx_hist_add_entry(rx_list, packet); + /* + * FIXME: else what? should we destroy the packet + * like above? + */ + } + } + +trim_history: + /* + * Trim history (remove all packets after the NUM_LATE_LOSS + 1 + * data packets) + */ + num_later = TFRC_RECV_NUM_LATE_LOSS + 1; + + if (!list_empty(li_list)) { + list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { + if (num_later == 0) { + list_del_init(&entry->dccphrx_node); + dccp_rx_hist_entry_delete(hist, entry); + } else if (dccp_rx_hist_entry_data_packet(entry)) + --num_later; + } + } else { + int step = 0; + u8 win_count = 0; /* Not needed, but lets shut up gcc */ + int tmp; + /* + * We have no loss interval history so we need at least one + * rtt:s of data packets to approximate rtt. + */ + list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { + if (num_later == 0) { + switch (step) { + case 0: + step = 1; + /* OK, find next data packet */ + num_later = 1; + break; + case 1: + step = 2; + /* OK, find next data packet */ + num_later = 1; + win_count = entry->dccphrx_ccval; + break; + case 2: + tmp = win_count - entry->dccphrx_ccval; + if (tmp < 0) + tmp += TFRC_WIN_COUNT_LIMIT; + if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) { + /* + * We have found a packet older + * than one rtt remove the rest + */ + step = 3; + } else /* OK, find next data packet */ + num_later = 1; + break; + case 3: + list_del_init(&entry->dccphrx_node); + dccp_rx_hist_entry_delete(hist, entry); + break; + } + } else if (dccp_rx_hist_entry_data_packet(entry)) + --num_later; + } + } + + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet); + struct dccp_tx_hist *dccp_tx_hist_new(const char *name) { struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 235828d..ebfcb8e 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -44,6 +44,12 @@ #include "../../dccp.h" +/* Number of later packets received before one is considered lost */ +#define TFRC_RECV_NUM_LATE_LOSS 3 + +#define TFRC_WIN_COUNT_PER_RTT 4 +#define TFRC_WIN_COUNT_LIMIT 16 + struct dccp_tx_hist_entry { struct list_head dccphtx_node; u64 dccphtx_seqno:48, @@ -182,4 +188,9 @@ static inline int entry->dccphrx_type == DCCP_PKT_DATAACK; } +extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, + struct list_head *rx_list, + struct list_head *li_list, + struct dccp_rx_hist_entry *packet); + #endif /* _DCCP_PKT_HIST_ */ -- cgit v0.10.2 From 29e4f8b3c340c4b2a0c6dd197b985e03826afd13 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 28 Aug 2005 02:00:28 -0300 Subject: [CCID3]: Move ccid3_hc_rx_detect_loss to packet_history.c Renaming it to dccp_rx_hist_detect_loss. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 849f558..4ff6ede 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -927,86 +927,11 @@ static void ccid3_hc_rx_detect_loss(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; - struct dccp_rx_hist_entry *entry, *next, *packet; - struct dccp_rx_hist_entry *a_loss = NULL; - struct dccp_rx_hist_entry *b_loss = NULL; - u64 seq_loss = DCCP_MAX_SEQNO + 1; - u8 win_loss = 0; - u8 num_later = TFRC_RECV_NUM_LATE_LOSS; + u8 win_loss; + const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist, + &hcrx->ccid3hcrx_li_hist, + &win_loss); - list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, - dccphrx_node) { - if (num_later == 0) { - b_loss = entry; - break; - } else if (dccp_rx_hist_entry_data_packet(entry)) - --num_later; - } - - if (b_loss == NULL) - goto out_update_li; - - num_later = 1; - - list_for_each_entry_safe_continue(entry, next, &hcrx->ccid3hcrx_hist, - dccphrx_node) { - if (num_later == 0) { - a_loss = entry; - break; - } else if (dccp_rx_hist_entry_data_packet(entry)) - --num_later; - } - - if (a_loss == NULL) { - if (list_empty(&hcrx->ccid3hcrx_li_hist)) { - /* no loss event have occured yet */ - LIMIT_NETDEBUG("%s: TODO: find a lost data packet by " - "comparing to initial seqno\n", - dccp_role(sk)); - goto out_update_li; - } else { - pr_info("%s: %s, sk=%p, ERROR! Less than 4 data " - "packets in history", - __FUNCTION__, dccp_role(sk), sk); - return; - } - } - - /* Locate a lost data packet */ - entry = packet = b_loss; - list_for_each_entry_safe_continue(entry, next, &hcrx->ccid3hcrx_hist, - dccphrx_node) { - u64 delta = dccp_delta_seqno(entry->dccphrx_seqno, - packet->dccphrx_seqno); - - if (delta != 0) { - if (dccp_rx_hist_entry_data_packet(packet)) - --delta; - /* - * FIXME: check this, probably this % usage is because - * in earlier drafts the ndp count was just 8 bits - * long, but now it cam be up to 24 bits long. - */ -#if 0 - if (delta % DCCP_NDP_LIMIT != - (packet->dccphrx_ndp - - entry->dccphrx_ndp) % DCCP_NDP_LIMIT) -#endif - if (delta != - packet->dccphrx_ndp - entry->dccphrx_ndp) { - seq_loss = entry->dccphrx_seqno; - dccp_inc_seqno(&seq_loss); - } - } - packet = entry; - if (packet == a_loss) - break; - } - - if (seq_loss != DCCP_MAX_SEQNO + 1) - win_loss = a_loss->dccphrx_ccval; - -out_update_li: ccid3_hc_rx_update_li(sk, seq_loss, win_loss); } diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index e2576b4..d3f9d20 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -224,6 +224,88 @@ trim_history: EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet); +u64 dccp_rx_hist_detect_loss(struct list_head *rx_list, + struct list_head *li_list, u8 *win_loss) +{ + struct dccp_rx_hist_entry *entry, *next, *packet; + struct dccp_rx_hist_entry *a_loss = NULL; + struct dccp_rx_hist_entry *b_loss = NULL; + u64 seq_loss = DCCP_MAX_SEQNO + 1; + u8 num_later = TFRC_RECV_NUM_LATE_LOSS; + + list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { + if (num_later == 0) { + b_loss = entry; + break; + } else if (dccp_rx_hist_entry_data_packet(entry)) + --num_later; + } + + if (b_loss == NULL) + goto out; + + num_later = 1; + list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) { + if (num_later == 0) { + a_loss = entry; + break; + } else if (dccp_rx_hist_entry_data_packet(entry)) + --num_later; + } + + if (a_loss == NULL) { + if (list_empty(li_list)) { + /* no loss event have occured yet */ + LIMIT_NETDEBUG("%s: TODO: find a lost data packet by " + "comparing to initial seqno\n", + __FUNCTION__); + goto out; + } else { + LIMIT_NETDEBUG("%s: Less than 4 data pkts in history!", + __FUNCTION__); + goto out; + } + } + + /* Locate a lost data packet */ + entry = packet = b_loss; + list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) { + u64 delta = dccp_delta_seqno(entry->dccphrx_seqno, + packet->dccphrx_seqno); + + if (delta != 0) { + if (dccp_rx_hist_entry_data_packet(packet)) + --delta; + /* + * FIXME: check this, probably this % usage is because + * in earlier drafts the ndp count was just 8 bits + * long, but now it cam be up to 24 bits long. + */ +#if 0 + if (delta % DCCP_NDP_LIMIT != + (packet->dccphrx_ndp - + entry->dccphrx_ndp) % DCCP_NDP_LIMIT) +#endif + if (delta != packet->dccphrx_ndp - entry->dccphrx_ndp) { + seq_loss = entry->dccphrx_seqno; + dccp_inc_seqno(&seq_loss); + } + } + packet = entry; + if (packet == a_loss) + break; + } +out: + if (seq_loss != DCCP_MAX_SEQNO + 1) + *win_loss = a_loss->dccphrx_ccval; + else + *win_loss = 0; /* Paranoia */ + + return seq_loss; +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_detect_loss); + struct dccp_tx_hist *dccp_tx_hist_new(const char *name) { struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index ebfcb8e..fb90a91 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -193,4 +193,7 @@ extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, struct list_head *li_list, struct dccp_rx_hist_entry *packet); +extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list, + struct list_head *li_list, u8 *win_loss); + #endif /* _DCCP_PKT_HIST_ */ -- cgit v0.10.2 From 5f2c3b910744f68e1a507f027398f404b3feb5fb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sat, 27 Aug 2005 22:37:03 -0700 Subject: [NETFILTER]: Add new iptables TTL target This new iptables target allows manipulation of the TTL of an IPv4 packet. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv4/ipt_TTL.h b/include/linux/netfilter_ipv4/ipt_TTL.h new file mode 100644 index 0000000..ee6611e --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_TTL.h @@ -0,0 +1,21 @@ +/* TTL modification module for IP tables + * (C) 2000 by Harald Welte */ + +#ifndef _IPT_TTL_H +#define _IPT_TTL_H + +enum { + IPT_TTL_SET = 0, + IPT_TTL_INC, + IPT_TTL_DEC +}; + +#define IPT_TTL_MAXMODE IPT_TTL_DEC + +struct ipt_TTL_info { + u_int8_t mode; + u_int8_t ttl; +}; + + +#endif diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index c4213f3..e046f55 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -664,6 +664,20 @@ config IP_NF_TARGET_CLASSIFY To compile it as a module, choose M here. If unsure, say N. +config IP_NF_TARGET_TTL + tristate 'TTL target support' + depends on IP_NF_MANGLE + help + This option adds a `TTL' target, which enables the user to modify + the TTL value of the IP header. + + While it is safe to decrement/lower the TTL, this target also enables + functionality to increment and set the TTL value of the IP header to + arbitrary values. This is EXTREMELY DANGEROUS since you can easily + create immortal packets that loop forever on the network. + + To compile it as a module, choose M here. If unsure, say N. + config IP_NF_TARGET_CONNMARK tristate 'CONNMARK target support' depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 89cae69..a7bd38f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o +obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o # generic ARP tables obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c new file mode 100644 index 0000000..b9ae6a9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -0,0 +1,119 @@ +/* TTL modification target for IP tables + * (C) 2000,2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("IP tables TTL modification module"); +MODULE_LICENSE("GPL"); + +static unsigned int +ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const void *targinfo, void *userinfo) +{ + struct iphdr *iph; + const struct ipt_TTL_info *info = targinfo; + u_int16_t diffs[2]; + int new_ttl; + + if (!skb_make_writable(pskb, (*pskb)->len)) + return NF_DROP; + + iph = (*pskb)->nh.iph; + + switch (info->mode) { + case IPT_TTL_SET: + new_ttl = info->ttl; + break; + case IPT_TTL_INC: + new_ttl = iph->ttl + info->ttl; + if (new_ttl > 255) + new_ttl = 255; + break; + case IPT_TTL_DEC: + new_ttl = iph->ttl - info->ttl; + if (new_ttl < 0) + new_ttl = 0; + break; + default: + new_ttl = iph->ttl; + break; + } + + if (new_ttl != iph->ttl) { + diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF; + iph->ttl = new_ttl; + diffs[1] = htons(((unsigned)iph->ttl) << 8); + iph->check = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + iph->check^0xFFFF)); + } + + return IPT_CONTINUE; +} + +static int ipt_ttl_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_TTL_info *info = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) { + printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_TTL_info))); + return 0; + } + + if (strcmp(tablename, "mangle")) { + printk(KERN_WARNING "ipt_TTL: can only be called from " + "\"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (info->mode > IPT_TTL_MAXMODE) { + printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", + info->mode); + return 0; + } + + if ((info->mode != IPT_TTL_SET) && (info->ttl == 0)) + return 0; + + return 1; +} + +static struct ipt_target ipt_TTL = { + .name = "TTL", + .target = ipt_ttl_target, + .checkentry = ipt_ttl_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_TTL); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_TTL); +} + +module_init(init); +module_exit(fini); -- cgit v0.10.2 From 0ac4f893f20ed524198da5ebf591fc0b9e2ced2f Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sat, 27 Aug 2005 22:37:30 -0700 Subject: [NETFILTER6]: Add new ip6tables HOPLIMIT target This target allows users to modify the hoplimit header field of the IPv6 header. Signed-off-by: Harald Welte Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv6/ip6t_HL.h b/include/linux/netfilter_ipv6/ip6t_HL.h new file mode 100644 index 0000000..afb7813 --- /dev/null +++ b/include/linux/netfilter_ipv6/ip6t_HL.h @@ -0,0 +1,22 @@ +/* Hop Limit modification module for ip6tables + * Maciej Soltysiak + * Based on HW's TTL module */ + +#ifndef _IP6T_HL_H +#define _IP6T_HL_H + +enum { + IP6T_HL_SET = 0, + IP6T_HL_INC, + IP6T_HL_DEC +}; + +#define IP6T_HL_MAXMODE IP6T_HL_DEC + +struct ip6t_HL_info { + u_int8_t mode; + u_int8_t hop_limit; +}; + + +#endif diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 8a10c2d..216fbe1a 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -239,6 +239,22 @@ config IP6_NF_TARGET_MARK To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_TARGET_HL + tristate 'HL (hoplimit) target support' + depends on IP6_NF_MANGLE + help + This option adds a `HL' target, which enables the user to decrement + the hoplimit value of the IPv6 header or set it to a given (lower) + value. + + While it is safe to decrement the hoplimit value, this option also + enables functionality to increment and set the hoplimit value of the + IPv6 header to arbitrary values. This is EXTREMELY DANGEROUS since + you can easily create immortal packets that loop forever on the + network. + + To compile it as a module, choose M here. If unsure, say N. + #dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES config IP6_NF_RAW tristate 'raw table support (required for TRACE)' diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 70f6ba6..bd9a16a 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o +obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c new file mode 100644 index 0000000..8f5549b --- /dev/null +++ b/net/ipv6/netfilter/ip6t_HL.c @@ -0,0 +1,118 @@ +/* + * Hop Limit modification target for ip6tables + * Maciej Soltysiak + * Based on HW's TTL module + * + * This software is distributed under the terms of GNU GPL + */ + +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Maciej Soltysiak "); +MODULE_DESCRIPTION("IP tables Hop Limit modification module"); +MODULE_LICENSE("GPL"); + +static unsigned int ip6t_hl_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, void *userinfo) +{ + struct ipv6hdr *ip6h; + const struct ip6t_HL_info *info = targinfo; + u_int16_t diffs[2]; + int new_hl; + + if (!skb_make_writable(pskb, (*pskb)->len)) + return NF_DROP; + + ip6h = (*pskb)->nh.ipv6h; + + switch (info->mode) { + case IP6T_HL_SET: + new_hl = info->hop_limit; + break; + case IP6T_HL_INC: + new_hl = ip6h->hop_limit + info->hop_limit; + if (new_hl > 255) + new_hl = 255; + break; + case IP6T_HL_DEC: + new_hl = ip6h->hop_limit - info->hop_limit; + if (new_hl < 0) + new_hl = 0; + break; + default: + new_hl = ip6h->hop_limit; + break; + } + + if (new_hl != ip6h->hop_limit) { + diffs[0] = htons(((unsigned)ip6h->hop_limit) << 8) ^ 0xFFFF; + ip6h->hop_limit = new_hl; + diffs[1] = htons(((unsigned)ip6h->hop_limit) << 8); + } + + return IP6T_CONTINUE; +} + +static int ip6t_hl_checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip6t_HL_info *info = targinfo; + + if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_HL_info))) { + printk(KERN_WARNING "ip6t_HL: targinfosize %u != %Zu\n", + targinfosize, + IP6T_ALIGN(sizeof(struct ip6t_HL_info))); + return 0; + } + + if (strcmp(tablename, "mangle")) { + printk(KERN_WARNING "ip6t_HL: can only be called from " + "\"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (info->mode > IP6T_HL_MAXMODE) { + printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n", + info->mode); + return 0; + } + + if ((info->mode != IP6T_HL_SET) && (info->hop_limit == 0)) { + printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't " + "make sense with value 0\n"); + return 0; + } + + return 1; +} + +static struct ip6t_target ip6t_HL = { + .name = "HL", + .target = ip6t_hl_target, + .checkentry = ip6t_hl_checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ip6t_register_target(&ip6t_HL); +} + +static void __exit fini(void) +{ + ip6t_unregister_target(&ip6t_HL); +} + +module_init(init); +module_exit(fini); -- cgit v0.10.2 From a84ffe430342db6ee585a5038f3242a6b4112d69 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 28 Aug 2005 04:51:32 -0300 Subject: [DCCP]: Introduce DCCP_SOCKOPT_PACKET_SIZE So that applications can set dccp_sock->dccps_pkt_size, that in turn is used in the CCID3 half connection init routines to set ccid3hc[tr]x_s and use it in its rate calculations. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 9e3a137..007c290 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -186,6 +186,9 @@ enum { DCCPF_MAX_CCID_SPECIFIC = 255, }; +/* DCCP socket options */ +#define DCCP_SOCKOPT_PACKET_SIZE 1 + #ifdef __KERNEL__ #include @@ -396,7 +399,7 @@ enum dccp_role { * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options) * @dccps_pmtu_cookie - Last pmtu seen by socket - * @dccps_avg_packet_size - FIXME: has to be set by the app thru some setsockopt or ioctl, CCID3 uses it + * @dccps_packet_size - Set thru setsockopt * @dccps_role - Role of this sock, one of %dccp_role * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_hc_rx_ackpkts - receiver half connection acked packets @@ -417,7 +420,7 @@ struct dccp_sock { unsigned long dccps_service; struct timeval dccps_timestamp_time; __u32 dccps_timestamp_echo; - __u32 dccps_avg_packet_size; + __u32 dccps_packet_size; unsigned long dccps_ndp_count; __u16 dccps_ext_header_len; __u32 dccps_pmtu_cookie; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 4ff6ede..e22b0ee 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -672,9 +672,9 @@ static int ccid3_hc_tx_init(struct sock *sk) memset(hctx, 0, sizeof(*hctx)); - if (dp->dccps_avg_packet_size >= TFRC_MIN_PACKET_SIZE && - dp->dccps_avg_packet_size <= TFRC_MAX_PACKET_SIZE) - hctx->ccid3hctx_s = (u16)dp->dccps_avg_packet_size; + if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE && + dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE) + hctx->ccid3hctx_s = dp->dccps_packet_size; else hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE; @@ -1058,9 +1058,9 @@ static int ccid3_hc_rx_init(struct sock *sk) memset(hcrx, 0, sizeof(*hcrx)); - if (dp->dccps_avg_packet_size >= TFRC_MIN_PACKET_SIZE && - dp->dccps_avg_packet_size <= TFRC_MAX_PACKET_SIZE) - hcrx->ccid3hcrx_s = (u16)dp->dccps_avg_packet_size; + if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE && + dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE) + hcrx->ccid3hcrx_s = dp->dccps_packet_size; else hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index f4da656..18a0e69 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -205,23 +205,67 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) int dccp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { - dccp_pr_debug("entry\n"); + struct dccp_sock *dp; + int err; + int val; if (level != SOL_DCCP) return ip_setsockopt(sk, level, optname, optval, optlen); - return -EOPNOTSUPP; + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + lock_sock(sk); + + dp = dccp_sk(sk); + err = 0; + + switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + dp->dccps_packet_size = val; + break; + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; } int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { - dccp_pr_debug("entry\n"); + struct dccp_sock *dp; + int val, len; if (level != SOL_DCCP) return ip_getsockopt(sk, level, optname, optval, optlen); - return -EOPNOTSUPP; + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + if (len < 0) + return -EINVAL; + + dp = dccp_sk(sk); + + switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + val = dp->dccps_packet_size; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen) || copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; } int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, -- cgit v0.10.2 From c530cfb1ce1e8f230744c3f3bd86771f50725053 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 29 Aug 2005 02:15:54 -0300 Subject: [CCID3]: Call sk->sk_write_space(sk) when receiving a feedback packet This makes the send rate calculations behave way more closely to what is specified, with the jitter previously seen on x and x_recv disappearing completely on non lossy setups. This resembles the tcp_data_snd_check code, that possibly we'll end up using in DCCP as well, perhaps moving this code to inet_connection_sock. For now I'm doing the simplest implementation tho. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index e22b0ee..7bf3b3a 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -556,6 +556,11 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* remove all packets older than the one acked from history */ dccp_tx_hist_purge_older(ccid3_tx_hist, &hctx->ccid3hctx_hist, packet); + /* + * As we have calculated new ipi, delta, t_nom it is possible that + * we now can send a packet, so wake up dccp_wait_for_ccids. + */ + sk->sk_write_space(sk); /* * Schedule no feedback timer to expire in diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 5cd9e79..33456c0 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -127,6 +127,7 @@ extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo); +extern void dccp_write_space(struct sock *sk); extern void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 3cf2cbc..3fc75db 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1280,6 +1280,7 @@ static int dccp_v4_init_sock(struct sock *sk) dccp_init_xmit_timers(sk); inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; sk->sk_state = DCCP_CLOSED; + sk->sk_write_space = dccp_write_space; dp->dccps_mss_cache = 536; dp->dccps_role = DCCP_ROLE_UNDEFINED; diff --git a/net/dccp/output.c b/net/dccp/output.c index 116f6db..28de157 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -150,6 +150,19 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) return mss_now; } +void dccp_write_space(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + /* Should agree with poll, otherwise some programs break */ + if (sock_writeable(sk)) + sk_wake_async(sk, 2, POLL_OUT); + + read_unlock(&sk->sk_callback_lock); +} + /** * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet * @sk: socket to wait for -- cgit v0.10.2 From d992895ba2b27cf5adf1ba0ad6d27662adc54c5e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 28 Aug 2005 16:49:11 +1000 Subject: [PATCH] Lazy page table copies in fork() Defer copying of ptes until fault time when it is possible to reconstruct the pte from backing store. Idea from Andi Kleen and Nick Piggin. Thanks to input from Rik van Riel and Linus and to Hugh for correcting my blundering. Ray Fucillo reports: "I applied this latest patch to a 2.6.12 kernel and found that it does resolve the problem. Prior to the patch on this machine, I was seeing about 23ms spent in fork for ever 100MB of shared memory segment. After applying the patch, fork is taking about 1ms regardless of the shared memory size." Signed-off-by: Nick Piggin Signed-off-by: Linus Torvalds diff --git a/mm/memory.c b/mm/memory.c index e046b7e..a596c11 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -498,6 +498,17 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; + /* + * Don't copy ptes where a page fault will fill them correctly. + * Fork becomes much lighter when there are big shared or private + * readonly mappings. The tradeoff is that copy_page_range is more + * efficient than faulting. + */ + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) { + if (!vma->anon_vma) + return 0; + } + if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); -- cgit v0.10.2 From b74d0bd53406c23636707565d87ddaa55d315b26 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 10 Aug 2005 13:53:41 +1000 Subject: [PATCH] ppc64: four level pagetables fix With CONFIG_HUGETLB_PAGE=n: In file included from kernel/sysctl.c:37: include/linux/hugetlb.h:104:1: warning: "hugetlb_free_pgd_range" redefined In file included from include/linux/mm.h:36, from kernel/sysctl.c:23: include/asm/pgtable.h:492:1: warning: this is the location of the previous definition Signed-off-by: Andrew Morton Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h index 5ea952a..c83679c 100644 --- a/include/asm-ppc64/pgtable.h +++ b/include/asm-ppc64/pgtable.h @@ -489,8 +489,10 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); +#ifdef CONFIG_HUGETLB_PAGE #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ free_pgd_range(tlb, addr, end, floor, ceiling) +#endif /* * This gets called at the end of handling a page fault, when -- cgit v0.10.2 From 2739e8cf113ce6e931608986a28bab5a42c0acd9 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 30 Aug 2005 12:54:07 +1000 Subject: [PATCH] Restore lparmap.s include for iSeries A mistake rebasing the series of ppc64 head.S cleanup patches meant the #include of lparmap.s, needed for iSeries was lost. This patch puts it back again. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index cccec49..d98a998 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -1269,7 +1269,16 @@ initial_stab: .= 0x7000 .globl fwnmi_data_area fwnmi_data_area: - .space PAGE_SIZE + + /* iSeries does not use the FWNMI stuff, so it is safe to put + * this here, even if we later allow kernels that will boot on + * both pSeries and iSeries */ +#ifdef CONFIG_PPC_ISERIES + . = LPARMAP_PHYS +#include "lparmap.s" +#endif /* CONFIG_PPC_ISERIES */ + + . = 0x8000 /* * On pSeries, secondary processors spin in the following code. -- cgit v0.10.2 From 1e4a79e0458beca871c662028610ae3a88e3f1bf Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 29 Aug 2005 18:26:01 +1000 Subject: [PATCH] fix iSeries build for gcc-3.4 gcc 3.4 (at least the build we are using) puts the gcc generated .ident string into a .note section at the end of the files it compiles (gcc 3.3.3-hammer and gcc 4.0.2 Debian puts it in the .text section). This means that the lparmap.s file we produce in the iSeries build may end with a .note section. When we include it into head.S, the assembler can no longer resolve some of the conditional branches since the target label ends up too far away. This patch just forces us back to the .text section after including lparmap.s. The breakage was caused by my patch "iSeries build with newer assemblers and compilers" (sha1-id: 2ad56496627630ebc99f06af5f81ca23e17e014e). Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index d98a998..0369597 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -1276,6 +1276,11 @@ fwnmi_data_area: #ifdef CONFIG_PPC_ISERIES . = LPARMAP_PHYS #include "lparmap.s" +/* + * This ".text" is here for old compilers that generate a trailing + * .note section when compiling .c files to .s + */ + .text #endif /* CONFIG_PPC_ISERIES */ . = 0x8000 -- cgit v0.10.2 From 5c0b4b8759f78c31172088a91e10733fc014ccee Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 17 Aug 2005 16:37:35 +1000 Subject: [PATCH] Formatting changes to vio.c Formatting changes to vio.c to bring it closer to the kernel coding standard. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index 3b790ba..7798f01 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -37,9 +37,11 @@ static int (*is_match)(const struct vio_device_id *id, static void (*unregister_device_callback)(struct vio_dev *dev); static void (*release_device_callback)(struct device *dev); -/* convert from struct device to struct vio_dev and pass to driver. +/* + * Convert from struct device to struct vio_dev and pass to driver. * dev->driver has already been set by generic code because vio_bus_match - * succeeded. */ + * succeeded. + */ static int vio_bus_probe(struct device *dev) { struct vio_dev *viodev = to_vio_dev(dev); @@ -51,9 +53,8 @@ static int vio_bus_probe(struct device *dev) return error; id = vio_match_device(viodrv->id_table, viodev); - if (id) { + if (id) error = viodrv->probe(viodev, id); - } return error; } @@ -64,9 +65,8 @@ static int vio_bus_remove(struct device *dev) struct vio_dev *viodev = to_vio_dev(dev); struct vio_driver *viodrv = to_vio_driver(dev->driver); - if (viodrv->remove) { + if (viodrv->remove) return viodrv->remove(viodev); - } /* driver can't remove */ return 1; @@ -102,16 +102,17 @@ void vio_unregister_driver(struct vio_driver *viodrv) EXPORT_SYMBOL(vio_unregister_driver); /** - * vio_match_device: - Tell if a VIO device has a matching VIO device id structure. - * @ids: array of VIO device id structures to search in - * @dev: the VIO device structure to match against + * vio_match_device: - Tell if a VIO device has a matching + * VIO device id structure. + * @ids: array of VIO device id structures to search in + * @dev: the VIO device structure to match against * * Used by a driver to check whether a VIO device present in the * system is in its list of supported devices. Returns the matching * vio_device_id structure or NULL if there is no match. */ -static const struct vio_device_id * vio_match_device(const struct vio_device_id *ids, - const struct vio_dev *dev) +static const struct vio_device_id *vio_match_device( + const struct vio_device_id *ids, const struct vio_dev *dev) { while (ids->type) { if (is_match(ids, dev)) @@ -141,7 +142,8 @@ int __init vio_bus_init(int (*match_func)(const struct vio_device_id *id, return err; } - /* the fake parent of all vio devices, just to give us + /* + * The fake parent of all vio devices, just to give us * a nice directory */ err = device_register(&vio_bus_device.dev); @@ -162,7 +164,8 @@ static void __devinit vio_dev_release(struct device *dev) kfree(to_vio_dev(dev)); } -static ssize_t viodev_show_name(struct device *dev, struct device_attribute *attr, char *buf) +static ssize_t viodev_show_name(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_vio_dev(dev)->name); } @@ -262,16 +265,8 @@ static int vio_bus_match(struct device *dev, struct device_driver *drv) const struct vio_dev *vio_dev = to_vio_dev(dev); struct vio_driver *vio_drv = to_vio_driver(drv); const struct vio_device_id *ids = vio_drv->id_table; - const struct vio_device_id *found_id; - - if (!ids) - return 0; - found_id = vio_match_device(ids, vio_dev); - if (found_id) - return 1; - - return 0; + return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL); } struct bus_type vio_bus_type = { -- cgit v0.10.2 From b877b90f227fb9698d99fb70492d432362584082 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 17 Aug 2005 16:40:12 +1000 Subject: [PATCH] Create vio_register_device Take some assignments out of vio_register_device_common and rename it to vio_register_device. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/iSeries_vio.c b/arch/ppc64/kernel/iSeries_vio.c index b4268cc..d0960a8 100644 --- a/arch/ppc64/kernel/iSeries_vio.c +++ b/arch/ppc64/kernel/iSeries_vio.c @@ -68,7 +68,7 @@ static void __init iommu_vio_init(void) } /** - * vio_register_device: - Register a new vio device. + * vio_register_device_iseries: - Register a new iSeries vio device. * @voidev: The device to register. */ static struct vio_dev *__init vio_register_device_iseries(char *type, @@ -76,7 +76,7 @@ static struct vio_dev *__init vio_register_device_iseries(char *type, { struct vio_dev *viodev; - /* allocate a vio_dev for this node */ + /* allocate a vio_dev for this device */ viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL); if (!viodev) return NULL; @@ -84,8 +84,15 @@ static struct vio_dev *__init vio_register_device_iseries(char *type, snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%s%d", type, unit_num); - return vio_register_device_common(viodev, viodev->dev.bus_id, type, - unit_num, &vio_iommu_table); + viodev->name = viodev->dev.bus_id; + viodev->type = type; + viodev->unit_address = unit_num; + viodev->iommu_table = &vio_iommu_table; + if (vio_register_device(viodev) == NULL) { + kfree(viodev); + return NULL; + } + return viodev; } void __init probe_bus_iseries(void) diff --git a/arch/ppc64/kernel/pSeries_vio.c b/arch/ppc64/kernel/pSeries_vio.c index 338f9e1..81e94f8 100644 --- a/arch/ppc64/kernel/pSeries_vio.c +++ b/arch/ppc64/kernel/pSeries_vio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -181,11 +182,13 @@ struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node) } snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address); + viodev->name = of_node->name; + viodev->type = of_node->type; + viodev->unit_address = *unit_address; + viodev->iommu_table = vio_build_iommu_table(viodev); /* register with generic device framework */ - if (vio_register_device_common(viodev, of_node->name, of_node->type, - *unit_address, vio_build_iommu_table(viodev)) - == NULL) { + if (vio_register_device(viodev) == NULL) { /* XXX free TCE table */ kfree(viodev); return NULL; diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index 7798f01..3eab229 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -171,14 +171,8 @@ static ssize_t viodev_show_name(struct device *dev, } DEVICE_ATTR(name, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_name, NULL); -struct vio_dev * __devinit vio_register_device_common( - struct vio_dev *viodev, char *name, char *type, - uint32_t unit_address, struct iommu_table *iommu_table) +struct vio_dev * __devinit vio_register_device(struct vio_dev *viodev) { - viodev->name = name; - viodev->type = type; - viodev->unit_address = unit_address; - viodev->iommu_table = iommu_table; /* init generic 'struct device' fields: */ viodev->dev.parent = &vio_bus_device.dev; viodev->dev.bus = &vio_bus_type; diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h index a82e87c..578e301 100644 --- a/include/asm-ppc64/vio.h +++ b/include/asm-ppc64/vio.h @@ -56,9 +56,7 @@ const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length); int vio_get_irq(struct vio_dev *dev); int vio_enable_interrupts(struct vio_dev *dev); int vio_disable_interrupts(struct vio_dev *dev); -extern struct vio_dev * __devinit vio_register_device_common( - struct vio_dev *viodev, char *name, char *type, - uint32_t unit_address, struct iommu_table *iommu_table); +extern struct vio_dev * __devinit vio_register_device(struct vio_dev *viodev); extern struct dma_mapping_ops vio_dma_ops; -- cgit v0.10.2 From 71d276d751ff5ddba28312aecefb174b20a5b970 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 17 Aug 2005 16:41:44 +1000 Subject: [PATCH] Create vio_bus_ops Create vio_bus_ops so that we just pass a structure to vio_bus_init instead of three separate function pointers. Rearrange vio.h to avoid forward references. vio.h only needs struct device_node from prom.h so remove the include and just declare it. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/iSeries_vio.c b/arch/ppc64/kernel/iSeries_vio.c index d0960a8..6b754b0 100644 --- a/arch/ppc64/kernel/iSeries_vio.c +++ b/arch/ppc64/kernel/iSeries_vio.c @@ -131,6 +131,10 @@ static int vio_match_device_iseries(const struct vio_device_id *id, return strncmp(dev->type, id->type, strlen(id->type)) == 0; } +static struct vio_bus_ops vio_bus_ops_iseries = { + .match = vio_match_device_iseries, +}; + /** * vio_bus_init_iseries: - Initialize the iSeries virtual IO bus */ @@ -138,7 +142,7 @@ static int __init vio_bus_init_iseries(void) { int err; - err = vio_bus_init(vio_match_device_iseries, NULL, NULL); + err = vio_bus_init(&vio_bus_ops_iseries); if (err == 0) { iommu_vio_init(); vio_bus_device.iommu_table = &vio_iommu_table; diff --git a/arch/ppc64/kernel/pSeries_vio.c b/arch/ppc64/kernel/pSeries_vio.c index 81e94f8..e0ae06f 100644 --- a/arch/ppc64/kernel/pSeries_vio.c +++ b/arch/ppc64/kernel/pSeries_vio.c @@ -76,6 +76,12 @@ static void vio_unregister_device_pseries(struct vio_dev *viodev) device_remove_file(&viodev->dev, &dev_attr_devspec); } +static struct vio_bus_ops vio_bus_ops_pseries = { + .match = vio_match_device_pseries, + .unregister_device = vio_unregister_device_pseries, + .release_device = vio_release_device_pseries, +}; + /** * vio_bus_init_pseries: - Initialize the pSeries virtual IO bus */ @@ -83,9 +89,7 @@ static int __init vio_bus_init_pseries(void) { int err; - err = vio_bus_init(vio_match_device_pseries, - vio_unregister_device_pseries, - vio_release_device_pseries); + err = vio_bus_init(&vio_bus_ops_pseries); if (err == 0) probe_bus_pseries(); return err; diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index 3eab229..93c437a 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -32,10 +32,7 @@ struct vio_dev vio_bus_device = { /* fake "parent" device */ .dev.bus = &vio_bus_type, }; -static int (*is_match)(const struct vio_device_id *id, - const struct vio_dev *dev); -static void (*unregister_device_callback)(struct vio_dev *dev); -static void (*release_device_callback)(struct device *dev); +static struct vio_bus_ops vio_bus_ops; /* * Convert from struct device to struct vio_dev and pass to driver. @@ -115,7 +112,7 @@ static const struct vio_device_id *vio_match_device( const struct vio_device_id *ids, const struct vio_dev *dev) { while (ids->type) { - if (is_match(ids, dev)) + if (vio_bus_ops.match(ids, dev)) return ids; ids++; } @@ -125,16 +122,11 @@ static const struct vio_device_id *vio_match_device( /** * vio_bus_init: - Initialize the virtual IO bus */ -int __init vio_bus_init(int (*match_func)(const struct vio_device_id *id, - const struct vio_dev *dev), - void (*unregister_dev)(struct vio_dev *), - void (*release_dev)(struct device *)) +int __init vio_bus_init(struct vio_bus_ops *ops) { int err; - is_match = match_func; - unregister_device_callback = unregister_dev; - release_device_callback = release_dev; + vio_bus_ops = *ops; err = bus_register(&vio_bus_type); if (err) { @@ -159,8 +151,8 @@ int __init vio_bus_init(int (*match_func)(const struct vio_device_id *id, /* vio_dev refcount hit 0 */ static void __devinit vio_dev_release(struct device *dev) { - if (release_device_callback) - release_device_callback(dev); + if (vio_bus_ops.release_device) + vio_bus_ops.release_device(dev); kfree(to_vio_dev(dev)); } @@ -191,8 +183,8 @@ struct vio_dev * __devinit vio_register_device(struct vio_dev *viodev) void __devinit vio_unregister_device(struct vio_dev *viodev) { - if (unregister_device_callback) - unregister_device_callback(viodev); + if (vio_bus_ops.unregister_device) + vio_bus_ops.unregister_device(viodev); device_remove_file(&viodev->dev, &dev_attr_name); device_unregister(&viodev->dev); } diff --git a/drivers/scsi/ibmvscsi/rpa_vscsi.c b/drivers/scsi/ibmvscsi/rpa_vscsi.c index 035f615..8bf5652 100644 --- a/drivers/scsi/ibmvscsi/rpa_vscsi.c +++ b/drivers/scsi/ibmvscsi/rpa_vscsi.c @@ -28,6 +28,7 @@ */ #include +#include #include #include #include diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h index 578e301..85420bb 100644 --- a/include/asm-ppc64/vio.h +++ b/include/asm-ppc64/vio.h @@ -19,13 +19,14 @@ #include #include #include + #include -#include #include -/* + +/* * Architecture-specific constants for drivers to * extract attributes of the device using vio_get_attribute() -*/ + */ #define VETH_MAC_ADDR "local-mac-address" #define VETH_MCAST_FILTER_SIZE "ibm,mac-address-filters" @@ -37,30 +38,19 @@ #define VIO_IRQ_DISABLE 0UL #define VIO_IRQ_ENABLE 1UL -struct vio_dev; -struct vio_driver; -struct vio_device_id; struct iommu_table; -int vio_register_driver(struct vio_driver *drv); -void vio_unregister_driver(struct vio_driver *drv); - -#ifdef CONFIG_PPC_PSERIES -struct vio_dev * __devinit vio_register_device_node( - struct device_node *node_vdev); -#endif -void __devinit vio_unregister_device(struct vio_dev *dev); -struct vio_dev *vio_find_node(struct device_node *vnode); - -const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length); -int vio_get_irq(struct vio_dev *dev); -int vio_enable_interrupts(struct vio_dev *dev); -int vio_disable_interrupts(struct vio_dev *dev); -extern struct vio_dev * __devinit vio_register_device(struct vio_dev *viodev); - -extern struct dma_mapping_ops vio_dma_ops; - -extern struct bus_type vio_bus_type; +/* + * The vio_dev structure is used to describe virtual I/O devices. + */ +struct vio_dev { + struct iommu_table *iommu_table; /* vio_map_* uses this */ + char *name; + char *type; + uint32_t unit_address; + unsigned int irq; + struct device dev; +}; struct vio_device_id { char *type; @@ -70,42 +60,51 @@ struct vio_device_id { struct vio_driver { struct list_head node; char *name; - const struct vio_device_id *id_table; /* NULL if wants all devices */ - int (*probe) (struct vio_dev *dev, const struct vio_device_id *id); /* New device inserted */ - int (*remove) (struct vio_dev *dev); /* Device removed (NULL if not a hot-plug capable driver) */ + const struct vio_device_id *id_table; + int (*probe)(struct vio_dev *dev, const struct vio_device_id *id); + int (*remove)(struct vio_dev *dev); unsigned long driver_data; - struct device_driver driver; }; +struct vio_bus_ops { + int (*match)(const struct vio_device_id *id, const struct vio_dev *dev); + void (*unregister_device)(struct vio_dev *); + void (*release_device)(struct device *); +}; + +extern struct dma_mapping_ops vio_dma_ops; +extern struct bus_type vio_bus_type; +extern struct vio_dev vio_bus_device; + +extern int vio_register_driver(struct vio_driver *drv); +extern void vio_unregister_driver(struct vio_driver *drv); + +extern struct vio_dev * __devinit vio_register_device(struct vio_dev *viodev); +extern void __devinit vio_unregister_device(struct vio_dev *dev); + +extern int vio_bus_init(struct vio_bus_ops *); + +#ifdef CONFIG_PPC_PSERIES +struct device_node; + +extern struct vio_dev * __devinit vio_register_device_node( + struct device_node *node_vdev); +extern struct vio_dev *vio_find_node(struct device_node *vnode); +extern const void *vio_get_attribute(struct vio_dev *vdev, void *which, + int *length); +extern int vio_enable_interrupts(struct vio_dev *dev); +extern int vio_disable_interrupts(struct vio_dev *dev); +#endif + static inline struct vio_driver *to_vio_driver(struct device_driver *drv) { return container_of(drv, struct vio_driver, driver); } -/* - * The vio_dev structure is used to describe virtual I/O devices. - */ -struct vio_dev { - struct iommu_table *iommu_table; /* vio_map_* uses this */ - char *name; - char *type; - uint32_t unit_address; - unsigned int irq; - - struct device dev; -}; - -extern struct vio_dev vio_bus_device; - static inline struct vio_dev *to_vio_dev(struct device *dev) { return container_of(dev, struct vio_dev, dev); } -extern int vio_bus_init(int (*is_match)(const struct vio_device_id *id, - const struct vio_dev *dev), - void (*)(struct vio_dev *), - void (*)(struct device *)); - #endif /* _ASM_VIO_H */ -- cgit v0.10.2 From fb120da678c517f72d4b39932062c2191827b331 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 17 Aug 2005 16:42:59 +1000 Subject: [PATCH] Make MODULE_DEVICE_TABLE work for vio devices Make MODULE_DEVICE_TABLE work for vio devices. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index 93c437a..c90e1dd 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -111,7 +111,7 @@ EXPORT_SYMBOL(vio_unregister_driver); static const struct vio_device_id *vio_match_device( const struct vio_device_id *ids, const struct vio_dev *dev) { - while (ids->type) { + while (ids->type[0] != '\0') { if (vio_bus_ops.match(ids, dev)) return ids; ids++; diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index 46e56a2..e46ecd2 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -776,7 +776,7 @@ static int viodasd_remove(struct vio_dev *vdev) */ static struct vio_device_id viodasd_device_table[] __devinitdata = { { "viodasd", "" }, - { 0, } + { "", "" } }; MODULE_DEVICE_TABLE(vio, viodasd_device_table); diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index 38dd9ff..0829db5 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -734,7 +734,7 @@ static int viocd_remove(struct vio_dev *vdev) */ static struct vio_device_id viocd_device_table[] __devinitdata = { { "viocd", "" }, - { 0, } + { "", "" } }; MODULE_DEVICE_TABLE(vio, viocd_device_table); diff --git a/drivers/char/hvc_vio.c b/drivers/char/hvc_vio.c index 60bb915..78d681d 100644 --- a/drivers/char/hvc_vio.c +++ b/drivers/char/hvc_vio.c @@ -39,7 +39,7 @@ char hvc_driver_name[] = "hvc_console"; static struct vio_device_id hvc_driver_table[] __devinitdata = { {"serial", "hvterm1"}, - { NULL, } + { "", "" } }; MODULE_DEVICE_TABLE(vio, hvc_driver_table); diff --git a/drivers/char/hvcs.c b/drivers/char/hvcs.c index 3236d24..f47f009 100644 --- a/drivers/char/hvcs.c +++ b/drivers/char/hvcs.c @@ -527,7 +527,7 @@ static int khvcsd(void *unused) static struct vio_device_id hvcs_driver_table[] __devinitdata= { {"serial-server", "hvterm2"}, - { NULL, } + { "", "" } }; MODULE_DEVICE_TABLE(vio, hvcs_driver_table); diff --git a/drivers/char/viotape.c b/drivers/char/viotape.c index 4764b4f..0aff45f 100644 --- a/drivers/char/viotape.c +++ b/drivers/char/viotape.c @@ -991,7 +991,7 @@ static int viotape_remove(struct vio_dev *vdev) */ static struct vio_device_id viotape_device_table[] __devinitdata = { { "viotape", "" }, - { 0, } + { "", "" } }; MODULE_DEVICE_TABLE(vio, viotape_device_table); diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c index c39b060..32d5fab 100644 --- a/drivers/net/ibmveth.c +++ b/drivers/net/ibmveth.c @@ -1144,7 +1144,7 @@ static void ibmveth_proc_unregister_driver(void) static struct vio_device_id ibmveth_device_table[] __devinitdata= { { "network", "IBM,l-lan"}, - { 0,} + { "", "" } }; MODULE_DEVICE_TABLE(vio, ibmveth_device_table); diff --git a/drivers/net/iseries_veth.c b/drivers/net/iseries_veth.c index 55af32e..183ba97 100644 --- a/drivers/net/iseries_veth.c +++ b/drivers/net/iseries_veth.c @@ -1370,7 +1370,7 @@ static int veth_probe(struct vio_dev *vdev, const struct vio_device_id *id) */ static struct vio_device_id veth_device_table[] __devinitdata = { { "vlan", "" }, - { NULL, NULL } + { "", "" } }; MODULE_DEVICE_TABLE(vio, veth_device_table); diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index fe09d14..2cb3c83 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -1442,7 +1442,7 @@ static int ibmvscsi_remove(struct vio_dev *vdev) */ static struct vio_device_id ibmvscsi_device_table[] __devinitdata = { {"vscsi", "IBM,v-scsi"}, - {0,} + { "", "" } }; MODULE_DEVICE_TABLE(vio, ibmvscsi_device_table); diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h index 85420bb..03f1b95 100644 --- a/include/asm-ppc64/vio.h +++ b/include/asm-ppc64/vio.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -52,11 +53,6 @@ struct vio_dev { struct device dev; }; -struct vio_device_id { - char *type; - char *compat; -}; - struct vio_driver { struct list_head node; char *name; diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 97bbccd..47da39b 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -1,6 +1,6 @@ /* * Device tables which are exported to userspace via - * scripts/table2alias.c. You must keep that file in sync with this + * scripts/mod/file2alias.c. You must keep that file in sync with this * header. */ @@ -190,6 +190,11 @@ struct of_device_id #endif }; +/* VIO */ +struct vio_device_id { + char type[32]; + char compat[32]; +}; /* PCMCIA */ diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 5180405..d8ee38a 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -341,6 +341,22 @@ static int do_of_entry (const char *filename, struct of_device_id *of, char *ali return 1; } +static int do_vio_entry(const char *filename, struct vio_device_id *vio, + char *alias) +{ + char *tmp; + + sprintf(alias, "vio:T%sS%s", vio->type[0] ? vio->type : "*", + vio->compat[0] ? vio->compat : "*"); + + /* Replace all whitespace with underscores */ + for (tmp = alias; tmp && *tmp; tmp++) + if (isspace (*tmp)) + *tmp = '_'; + + return 1; +} + /* Ignore any prefix, eg. v850 prepends _ */ static inline int sym_is(const char *symbol, const char *name) { @@ -422,6 +438,9 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, else if (sym_is(symname, "__mod_of_device_table")) do_table(symval, sym->st_size, sizeof(struct of_device_id), do_of_entry, mod); + else if (sym_is(symname, "__mod_vio_device_table")) + do_table(symval, sym->st_size, sizeof(struct vio_device_id), + do_vio_entry, mod); } -- cgit v0.10.2 From 45e2a6e4e5e22acd4321f69e84b726c2a568dacf Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 29 Aug 2005 13:15:50 +1000 Subject: [PATCH] Create include/asm-powerpc The ppc and ppc64 trees are hopefully going to merge over time, so this patch begins the process by creating a place for the merging of the header files. Create include/asm-powerpc (and move linkage.h into it from asm-{ppc,ppc64} since we don't like empty directories). Modify the ppc and ppc64 Makefiles to cope. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/arch/ppc/Makefile b/arch/ppc/Makefile index f9b0d77..d1b6e6d 100644 --- a/arch/ppc/Makefile +++ b/arch/ppc/Makefile @@ -21,11 +21,13 @@ CC := $(CC) -m32 endif LDFLAGS_vmlinux := -Ttext $(KERNELLOAD) -Bstatic -CPPFLAGS += -Iarch/$(ARCH) +CPPFLAGS += -Iarch/$(ARCH) -Iinclude3 AFLAGS += -Iarch/$(ARCH) CFLAGS += -Iarch/$(ARCH) -msoft-float -pipe \ -ffixed-r2 -mmultiple CPP = $(CC) -E $(CFLAGS) +# Temporary hack until we have migrated to asm-powerpc +LINUXINCLUDE += -Iinclude3 CHECKFLAGS += -D__powerpc__ @@ -101,6 +103,7 @@ endef archclean: $(Q)$(MAKE) $(clean)=arch/ppc/boot + $(Q)rm -rf include3 prepare: include/asm-$(ARCH)/offsets.h checkbin @@ -110,6 +113,12 @@ arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \ include/asm-$(ARCH)/offsets.h: arch/$(ARCH)/kernel/asm-offsets.s $(call filechk,gen-asm-offsets) +# Temporary hack until we have migrated to asm-powerpc +include/asm: include3/asm +include3/asm: + $(Q)if [ ! -d include3 ]; then mkdir -p include3; fi + $(Q)ln -fsn $(srctree)/include/asm-powerpc include3/asm + # Use the file '.tmp_gas_check' for binutils tests, as gas won't output # to stdout and these checks are run even on install targets. TOUT := .tmp_gas_check diff --git a/arch/ppc64/Makefile b/arch/ppc64/Makefile index 731b847..6350cce 100644 --- a/arch/ppc64/Makefile +++ b/arch/ppc64/Makefile @@ -55,6 +55,8 @@ LDFLAGS := -m elf64ppc LDFLAGS_vmlinux := -Bstatic -e $(KERNELLOAD) -Ttext $(KERNELLOAD) CFLAGS += -msoft-float -pipe -mminimal-toc -mtraceback=none \ -mcall-aixdesc +# Temporary hack until we have migrated to asm-powerpc +CPPFLAGS += -Iinclude3 GCC_VERSION := $(call cc-version) GCC_BROKEN_VEC := $(shell if [ $(GCC_VERSION) -lt 0400 ] ; then echo "y"; fi ;) @@ -112,6 +114,7 @@ all: $(KBUILD_IMAGE) archclean: $(Q)$(MAKE) $(clean)=$(boot) + $(Q)rm -rf include3 prepare: include/asm-ppc64/offsets.h @@ -121,6 +124,12 @@ arch/ppc64/kernel/asm-offsets.s: include/asm include/linux/version.h \ include/asm-ppc64/offsets.h: arch/ppc64/kernel/asm-offsets.s $(call filechk,gen-asm-offsets) +# Temporary hack until we have migrated to asm-powerpc +include/asm: include3/asm +include3/asm: + $(Q)if [ ! -d include3 ]; then mkdir -p include3; fi; + $(Q)ln -fsn $(srctree)/include/asm-powerpc include3/asm + define archhelp echo '* zImage - Compressed kernel image (arch/$(ARCH)/boot/zImage)' echo ' zImage.initrd- Compressed kernel image with initrd attached,' diff --git a/include/asm-powerpc/linkage.h b/include/asm-powerpc/linkage.h new file mode 100644 index 0000000..291c2d0 --- /dev/null +++ b/include/asm-powerpc/linkage.h @@ -0,0 +1,6 @@ +#ifndef __ASM_LINKAGE_H +#define __ASM_LINKAGE_H + +/* Nothing to see here... */ + +#endif diff --git a/include/asm-ppc/linkage.h b/include/asm-ppc/linkage.h deleted file mode 100644 index 291c2d0..0000000 --- a/include/asm-ppc/linkage.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_LINKAGE_H -#define __ASM_LINKAGE_H - -/* Nothing to see here... */ - -#endif diff --git a/include/asm-ppc64/linkage.h b/include/asm-ppc64/linkage.h deleted file mode 100644 index 291c2d0..0000000 --- a/include/asm-ppc64/linkage.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_LINKAGE_H -#define __ASM_LINKAGE_H - -/* Nothing to see here... */ - -#endif -- cgit v0.10.2 From 88999ceb55bf959e63df0c911915166b005977fc Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 29 Aug 2005 14:06:56 +1000 Subject: [PATCH] Move the identical files from include/asm-ppc{,64} Move the identical files from include/asm-ppc{,64}/ to include/asm-powerpc/. Remove hdreg.h completely as it is unused in the tree. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/include/asm-powerpc/8253pit.h b/include/asm-powerpc/8253pit.h new file mode 100644 index 0000000..862708a --- /dev/null +++ b/include/asm-powerpc/8253pit.h @@ -0,0 +1,10 @@ +/* + * 8253/8254 Programmable Interval Timer + */ + +#ifndef _8253PIT_H +#define _8253PIT_H + +#define PIT_TICK_RATE 1193182UL + +#endif diff --git a/include/asm-powerpc/agp.h b/include/asm-powerpc/agp.h new file mode 100644 index 0000000..ca9e423 --- /dev/null +++ b/include/asm-powerpc/agp.h @@ -0,0 +1,23 @@ +#ifndef AGP_H +#define AGP_H 1 + +#include + +/* nothing much needed here */ + +#define map_page_into_agp(page) +#define unmap_page_from_agp(page) +#define flush_agp_mappings() +#define flush_agp_cache() mb() + +/* Convert a physical address to an address suitable for the GART. */ +#define phys_to_gart(x) (x) +#define gart_to_phys(x) (x) + +/* GATT allocation. Returns/accepts GATT kernel virtual address. */ +#define alloc_gatt_pages(order) \ + ((char *)__get_free_pages(GFP_KERNEL, (order))) +#define free_gatt_pages(table, order) \ + free_pages((unsigned long)(table), (order)) + +#endif diff --git a/include/asm-powerpc/cputime.h b/include/asm-powerpc/cputime.h new file mode 100644 index 0000000..6d68ad7 --- /dev/null +++ b/include/asm-powerpc/cputime.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-powerpc/div64.h b/include/asm-powerpc/div64.h new file mode 100644 index 0000000..6cd978c --- /dev/null +++ b/include/asm-powerpc/div64.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-powerpc/emergency-restart.h b/include/asm-powerpc/emergency-restart.h new file mode 100644 index 0000000..3711bd9 --- /dev/null +++ b/include/asm-powerpc/emergency-restart.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-powerpc/ipc.h b/include/asm-powerpc/ipc.h new file mode 100644 index 0000000..a46e3d9 --- /dev/null +++ b/include/asm-powerpc/ipc.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-powerpc/xor.h b/include/asm-powerpc/xor.h new file mode 100644 index 0000000..c82eb12 --- /dev/null +++ b/include/asm-powerpc/xor.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-ppc/8253pit.h b/include/asm-ppc/8253pit.h deleted file mode 100644 index 285f784..0000000 --- a/include/asm-ppc/8253pit.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * 8253/8254 Programmable Interval Timer - */ - -#ifndef _8253PIT_H -#define _8253PIT_H - -#define PIT_TICK_RATE 1193182UL - -#endif diff --git a/include/asm-ppc/agp.h b/include/asm-ppc/agp.h deleted file mode 100644 index ca9e423..0000000 --- a/include/asm-ppc/agp.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef AGP_H -#define AGP_H 1 - -#include - -/* nothing much needed here */ - -#define map_page_into_agp(page) -#define unmap_page_from_agp(page) -#define flush_agp_mappings() -#define flush_agp_cache() mb() - -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - -/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -#define alloc_gatt_pages(order) \ - ((char *)__get_free_pages(GFP_KERNEL, (order))) -#define free_gatt_pages(table, order) \ - free_pages((unsigned long)(table), (order)) - -#endif diff --git a/include/asm-ppc/cputime.h b/include/asm-ppc/cputime.h deleted file mode 100644 index 8e9faf5..0000000 --- a/include/asm-ppc/cputime.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __PPC_CPUTIME_H -#define __PPC_CPUTIME_H - -#include - -#endif /* __PPC_CPUTIME_H */ diff --git a/include/asm-ppc/div64.h b/include/asm-ppc/div64.h deleted file mode 100644 index 6cd978c..0000000 --- a/include/asm-ppc/div64.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-ppc/emergency-restart.h b/include/asm-ppc/emergency-restart.h deleted file mode 100644 index 108d8c4..0000000 --- a/include/asm-ppc/emergency-restart.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_EMERGENCY_RESTART_H -#define _ASM_EMERGENCY_RESTART_H - -#include - -#endif /* _ASM_EMERGENCY_RESTART_H */ diff --git a/include/asm-ppc/hdreg.h b/include/asm-ppc/hdreg.h deleted file mode 100644 index 7f7fd1a..0000000 --- a/include/asm-ppc/hdreg.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-ppc/ipc.h b/include/asm-ppc/ipc.h deleted file mode 100644 index a46e3d9..0000000 --- a/include/asm-ppc/ipc.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-ppc/xor.h b/include/asm-ppc/xor.h deleted file mode 100644 index c82eb12..0000000 --- a/include/asm-ppc/xor.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-ppc64/8253pit.h b/include/asm-ppc64/8253pit.h deleted file mode 100644 index 285f784..0000000 --- a/include/asm-ppc64/8253pit.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * 8253/8254 Programmable Interval Timer - */ - -#ifndef _8253PIT_H -#define _8253PIT_H - -#define PIT_TICK_RATE 1193182UL - -#endif diff --git a/include/asm-ppc64/agp.h b/include/asm-ppc64/agp.h deleted file mode 100644 index ca9e423..0000000 --- a/include/asm-ppc64/agp.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef AGP_H -#define AGP_H 1 - -#include - -/* nothing much needed here */ - -#define map_page_into_agp(page) -#define unmap_page_from_agp(page) -#define flush_agp_mappings() -#define flush_agp_cache() mb() - -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - -/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -#define alloc_gatt_pages(order) \ - ((char *)__get_free_pages(GFP_KERNEL, (order))) -#define free_gatt_pages(table, order) \ - free_pages((unsigned long)(table), (order)) - -#endif diff --git a/include/asm-ppc64/cputime.h b/include/asm-ppc64/cputime.h deleted file mode 100644 index 8e9faf5..0000000 --- a/include/asm-ppc64/cputime.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __PPC_CPUTIME_H -#define __PPC_CPUTIME_H - -#include - -#endif /* __PPC_CPUTIME_H */ diff --git a/include/asm-ppc64/div64.h b/include/asm-ppc64/div64.h deleted file mode 100644 index 6cd978c..0000000 --- a/include/asm-ppc64/div64.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-ppc64/emergency-restart.h b/include/asm-ppc64/emergency-restart.h deleted file mode 100644 index 108d8c4..0000000 --- a/include/asm-ppc64/emergency-restart.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_EMERGENCY_RESTART_H -#define _ASM_EMERGENCY_RESTART_H - -#include - -#endif /* _ASM_EMERGENCY_RESTART_H */ diff --git a/include/asm-ppc64/hdreg.h b/include/asm-ppc64/hdreg.h deleted file mode 100644 index 7f7fd1a..0000000 --- a/include/asm-ppc64/hdreg.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-ppc64/ipc.h b/include/asm-ppc64/ipc.h deleted file mode 100644 index a46e3d9..0000000 --- a/include/asm-ppc64/ipc.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-ppc64/xor.h b/include/asm-ppc64/xor.h deleted file mode 100644 index c82eb12..0000000 --- a/include/asm-ppc64/xor.h +++ /dev/null @@ -1 +0,0 @@ -#include -- cgit v0.10.2 From 6f9aa727433fe7647869c9b64ce2f7b5feac0052 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 29 Aug 2005 14:08:11 +1000 Subject: [PATCH] Move all the very similar files to asm-powerpc They differed in either simple comments or in the protecting ifdefs. Signed-off-by: Stephen Rothwell Signed-off-by: Paul Mackerras diff --git a/include/asm-powerpc/errno.h b/include/asm-powerpc/errno.h new file mode 100644 index 0000000..19f20bd --- /dev/null +++ b/include/asm-powerpc/errno.h @@ -0,0 +1,11 @@ +#ifndef _PPC_ERRNO_H +#define _PPC_ERRNO_H + +#include + +#undef EDEADLOCK +#define EDEADLOCK 58 /* File locking deadlock error */ + +#define _LAST_ERRNO 516 + +#endif diff --git a/include/asm-powerpc/ioctl.h b/include/asm-powerpc/ioctl.h new file mode 100644 index 0000000..93c6acf --- /dev/null +++ b/include/asm-powerpc/ioctl.h @@ -0,0 +1,69 @@ +#ifndef _PPC_IOCTL_H +#define _PPC_IOCTL_H + + +/* + * this was copied from the alpha as it's a bit cleaner there. + * -- Cort + */ + +#define _IOC_NRBITS 8 +#define _IOC_TYPEBITS 8 +#define _IOC_SIZEBITS 13 +#define _IOC_DIRBITS 3 + +#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1) +#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1) +#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1) +#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1) + +#define _IOC_NRSHIFT 0 +#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS) +#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS) +#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS) + +/* + * Direction bits _IOC_NONE could be 0, but OSF/1 gives it a bit. + * And this turns out useful to catch old ioctl numbers in header + * files for us. + */ +#define _IOC_NONE 1U +#define _IOC_READ 2U +#define _IOC_WRITE 4U + +#define _IOC(dir,type,nr,size) \ + (((dir) << _IOC_DIRSHIFT) | \ + ((type) << _IOC_TYPESHIFT) | \ + ((nr) << _IOC_NRSHIFT) | \ + ((size) << _IOC_SIZESHIFT)) + +/* provoke compile error for invalid uses of size argument */ +extern unsigned int __invalid_size_argument_for_IOC; +#define _IOC_TYPECHECK(t) \ + ((sizeof(t) == sizeof(t[1]) && \ + sizeof(t) < (1 << _IOC_SIZEBITS)) ? \ + sizeof(t) : __invalid_size_argument_for_IOC) + +/* used to create numbers */ +#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) +#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size))) +#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) +#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) +#define _IOR_BAD(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size)) +#define _IOW_BAD(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size)) +#define _IOWR_BAD(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size)) + +/* used to decode them.. */ +#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) +#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) +#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) +#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK) + +/* various drivers, such as the pcmcia stuff, need these... */ +#define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT) +#define IOC_OUT (_IOC_READ << _IOC_DIRSHIFT) +#define IOC_INOUT ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT) +#define IOCSIZE_MASK (_IOC_SIZEMASK << _IOC_SIZESHIFT) +#define IOCSIZE_SHIFT (_IOC_SIZESHIFT) + +#endif diff --git a/include/asm-powerpc/ioctls.h b/include/asm-powerpc/ioctls.h new file mode 100644 index 0000000..f5b7f2b --- /dev/null +++ b/include/asm-powerpc/ioctls.h @@ -0,0 +1,107 @@ +#ifndef _ASM_PPC_IOCTLS_H +#define _ASM_PPC_IOCTLS_H + +#include + +#define FIOCLEX _IO('f', 1) +#define FIONCLEX _IO('f', 2) +#define FIOASYNC _IOW('f', 125, int) +#define FIONBIO _IOW('f', 126, int) +#define FIONREAD _IOR('f', 127, int) +#define TIOCINQ FIONREAD +#define FIOQSIZE _IOR('f', 128, loff_t) + +#define TIOCGETP _IOR('t', 8, struct sgttyb) +#define TIOCSETP _IOW('t', 9, struct sgttyb) +#define TIOCSETN _IOW('t', 10, struct sgttyb) /* TIOCSETP wo flush */ + +#define TIOCSETC _IOW('t', 17, struct tchars) +#define TIOCGETC _IOR('t', 18, struct tchars) +#define TCGETS _IOR('t', 19, struct termios) +#define TCSETS _IOW('t', 20, struct termios) +#define TCSETSW _IOW('t', 21, struct termios) +#define TCSETSF _IOW('t', 22, struct termios) + +#define TCGETA _IOR('t', 23, struct termio) +#define TCSETA _IOW('t', 24, struct termio) +#define TCSETAW _IOW('t', 25, struct termio) +#define TCSETAF _IOW('t', 28, struct termio) + +#define TCSBRK _IO('t', 29) +#define TCXONC _IO('t', 30) +#define TCFLSH _IO('t', 31) + +#define TIOCSWINSZ _IOW('t', 103, struct winsize) +#define TIOCGWINSZ _IOR('t', 104, struct winsize) +#define TIOCSTART _IO('t', 110) /* start output, like ^Q */ +#define TIOCSTOP _IO('t', 111) /* stop output, like ^S */ +#define TIOCOUTQ _IOR('t', 115, int) /* output queue size */ + +#define TIOCGLTC _IOR('t', 116, struct ltchars) +#define TIOCSLTC _IOW('t', 117, struct ltchars) +#define TIOCSPGRP _IOW('t', 118, int) +#define TIOCGPGRP _IOR('t', 119, int) + +#define TIOCEXCL 0x540C +#define TIOCNXCL 0x540D +#define TIOCSCTTY 0x540E + +#define TIOCSTI 0x5412 +#define TIOCMGET 0x5415 +#define TIOCMBIS 0x5416 +#define TIOCMBIC 0x5417 +#define TIOCMSET 0x5418 +# define TIOCM_LE 0x001 +# define TIOCM_DTR 0x002 +# define TIOCM_RTS 0x004 +# define TIOCM_ST 0x008 +# define TIOCM_SR 0x010 +# define TIOCM_CTS 0x020 +# define TIOCM_CAR 0x040 +# define TIOCM_RNG 0x080 +# define TIOCM_DSR 0x100 +# define TIOCM_CD TIOCM_CAR +# define TIOCM_RI TIOCM_RNG + +#define TIOCGSOFTCAR 0x5419 +#define TIOCSSOFTCAR 0x541A +#define TIOCLINUX 0x541C +#define TIOCCONS 0x541D +#define TIOCGSERIAL 0x541E +#define TIOCSSERIAL 0x541F +#define TIOCPKT 0x5420 +# define TIOCPKT_DATA 0 +# define TIOCPKT_FLUSHREAD 1 +# define TIOCPKT_FLUSHWRITE 2 +# define TIOCPKT_STOP 4 +# define TIOCPKT_START 8 +# define TIOCPKT_NOSTOP 16 +# define TIOCPKT_DOSTOP 32 + + +#define TIOCNOTTY 0x5422 +#define TIOCSETD 0x5423 +#define TIOCGETD 0x5424 +#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ +#define TIOCSBRK 0x5427 /* BSD compatibility */ +#define TIOCCBRK 0x5428 /* BSD compatibility */ +#define TIOCGSID 0x5429 /* Return the session ID of FD */ +#define TIOCGPTN _IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */ +#define TIOCSPTLCK _IOW('T',0x31, int) /* Lock/unlock Pty */ + +#define TIOCSERCONFIG 0x5453 +#define TIOCSERGWILD 0x5454 +#define TIOCSERSWILD 0x5455 +#define TIOCGLCKTRMIOS 0x5456 +#define TIOCSLCKTRMIOS 0x5457 +#define TIOCSERGSTRUCT 0x5458 /* For debugging only */ +#define TIOCSERGETLSR 0x5459 /* Get line status register */ + /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ +# define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ +#define TIOCSERGETMULTI 0x545A /* Get multiport config */ +#define TIOCSERSETMULTI 0x545B /* Set multiport config */ + +#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ +#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ + +#endif /* _ASM_PPC_IOCTLS_H */ diff --git a/include/asm-powerpc/local.h b/include/asm-powerpc/local.h new file mode 100644 index 0000000..c11c530 --- /dev/null +++ b/include/asm-powerpc/local.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-powerpc/namei.h b/include/asm-powerpc/namei.h new file mode 100644 index 0000000..29c9ec8 --- /dev/null +++ b/include/asm-powerpc/namei.h @@ -0,0 +1,20 @@ +/* + * include/asm-ppc/namei.h + * Adapted from include/asm-alpha/namei.h + * + * Included from fs/namei.c + */ + +#ifdef __KERNEL__ +#ifndef __PPC_NAMEI_H +#define __PPC_NAMEI_H + +/* This dummy routine maybe changed to something useful + * for /usr/gnemul/ emulation stuff. + * Look at asm-sparc/namei.h for details. + */ + +#define __emul_prefix() NULL + +#endif /* __PPC_NAMEI_H */ +#endif /* __KERNEL__ */ diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h new file mode 100644 index 0000000..06a959d --- /dev/null +++ b/include/asm-powerpc/percpu.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-powerpc/poll.h b/include/asm-powerpc/poll.h new file mode 100644 index 0000000..be50249 --- /dev/null +++ b/include/asm-powerpc/poll.h @@ -0,0 +1,23 @@ +#ifndef __PPC_POLL_H +#define __PPC_POLL_H + +#define POLLIN 0x0001 +#define POLLPRI 0x0002 +#define POLLOUT 0x0004 +#define POLLERR 0x0008 +#define POLLHUP 0x0010 +#define POLLNVAL 0x0020 +#define POLLRDNORM 0x0040 +#define POLLRDBAND 0x0080 +#define POLLWRNORM 0x0100 +#define POLLWRBAND 0x0200 +#define POLLMSG 0x0400 +#define POLLREMOVE 0x1000 + +struct pollfd { + int fd; + short events; + short revents; +}; + +#endif diff --git a/include/asm-powerpc/resource.h b/include/asm-powerpc/resource.h new file mode 100644 index 0000000..04bc4db --- /dev/null +++ b/include/asm-powerpc/resource.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-powerpc/shmparam.h b/include/asm-powerpc/shmparam.h new file mode 100644 index 0000000..d625060 --- /dev/null +++ b/include/asm-powerpc/shmparam.h @@ -0,0 +1,6 @@ +#ifndef _PPC_SHMPARAM_H +#define _PPC_SHMPARAM_H + +#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ + +#endif /* _PPC_SHMPARAM_H */ diff --git a/include/asm-powerpc/string.h b/include/asm-powerpc/string.h new file mode 100644 index 0000000..2255759 --- /dev/null +++ b/include/asm-powerpc/string.h @@ -0,0 +1,32 @@ +#ifndef _PPC_STRING_H_ +#define _PPC_STRING_H_ + +#ifdef __KERNEL__ + +#define __HAVE_ARCH_STRCPY +#define __HAVE_ARCH_STRNCPY +#define __HAVE_ARCH_STRLEN +#define __HAVE_ARCH_STRCMP +#define __HAVE_ARCH_STRCAT +#define __HAVE_ARCH_MEMSET +#define __HAVE_ARCH_MEMCPY +#define __HAVE_ARCH_MEMMOVE +#define __HAVE_ARCH_MEMCMP +#define __HAVE_ARCH_MEMCHR + +extern int strcasecmp(const char *, const char *); +extern int strncasecmp(const char *, const char *, int); +extern char * strcpy(char *,const char *); +extern char * strncpy(char *,const char *, __kernel_size_t); +extern __kernel_size_t strlen(const char *); +extern int strcmp(const char *,const char *); +extern char * strcat(char *, const char *); +extern void * memset(void *,int,__kernel_size_t); +extern void * memcpy(void *,const void *,__kernel_size_t); +extern void * memmove(void *,const void *,__kernel_size_t); +extern int memcmp(const void *,const void *,__kernel_size_t); +extern void * memchr(const void *,int,__kernel_size_t); + +#endif /* __KERNEL__ */ + +#endif diff --git a/include/asm-powerpc/unaligned.h b/include/asm-powerpc/unaligned.h new file mode 100644 index 0000000..45520d9 --- /dev/null +++ b/include/asm-powerpc/unaligned.h @@ -0,0 +1,18 @@ +#ifdef __KERNEL__ +#ifndef __PPC_UNALIGNED_H +#define __PPC_UNALIGNED_H + +/* + * The PowerPC can do unaligned accesses itself in big endian mode. + * + * The strange macros are there to make sure these can't + * be misused in a way that makes them not work on other + * architectures where unaligned accesses aren't as simple. + */ + +#define get_unaligned(ptr) (*(ptr)) + +#define put_unaligned(val, ptr) ((void)( *(ptr) = (val) )) + +#endif +#endif /* __KERNEL__ */ diff --git a/include/asm-ppc/errno.h b/include/asm-ppc/errno.h deleted file mode 100644 index 19f20bd..0000000 --- a/include/asm-ppc/errno.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _PPC_ERRNO_H -#define _PPC_ERRNO_H - -#include - -#undef EDEADLOCK -#define EDEADLOCK 58 /* File locking deadlock error */ - -#define _LAST_ERRNO 516 - -#endif diff --git a/include/asm-ppc/ioctl.h b/include/asm-ppc/ioctl.h deleted file mode 100644 index 93c6acf..0000000 --- a/include/asm-ppc/ioctl.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef _PPC_IOCTL_H -#define _PPC_IOCTL_H - - -/* - * this was copied from the alpha as it's a bit cleaner there. - * -- Cort - */ - -#define _IOC_NRBITS 8 -#define _IOC_TYPEBITS 8 -#define _IOC_SIZEBITS 13 -#define _IOC_DIRBITS 3 - -#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1) -#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1) -#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1) -#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1) - -#define _IOC_NRSHIFT 0 -#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS) -#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS) -#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS) - -/* - * Direction bits _IOC_NONE could be 0, but OSF/1 gives it a bit. - * And this turns out useful to catch old ioctl numbers in header - * files for us. - */ -#define _IOC_NONE 1U -#define _IOC_READ 2U -#define _IOC_WRITE 4U - -#define _IOC(dir,type,nr,size) \ - (((dir) << _IOC_DIRSHIFT) | \ - ((type) << _IOC_TYPESHIFT) | \ - ((nr) << _IOC_NRSHIFT) | \ - ((size) << _IOC_SIZESHIFT)) - -/* provoke compile error for invalid uses of size argument */ -extern unsigned int __invalid_size_argument_for_IOC; -#define _IOC_TYPECHECK(t) \ - ((sizeof(t) == sizeof(t[1]) && \ - sizeof(t) < (1 << _IOC_SIZEBITS)) ? \ - sizeof(t) : __invalid_size_argument_for_IOC) - -/* used to create numbers */ -#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) -#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size))) -#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) -#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) -#define _IOR_BAD(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size)) -#define _IOW_BAD(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size)) -#define _IOWR_BAD(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size)) - -/* used to decode them.. */ -#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) -#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) -#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) -#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK) - -/* various drivers, such as the pcmcia stuff, need these... */ -#define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT) -#define IOC_OUT (_IOC_READ << _IOC_DIRSHIFT) -#define IOC_INOUT ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT) -#define IOCSIZE_MASK (_IOC_SIZEMASK << _IOC_SIZESHIFT) -#define IOCSIZE_SHIFT (_IOC_SIZESHIFT) - -#endif diff --git a/include/asm-ppc/ioctls.h b/include/asm-ppc/ioctls.h deleted file mode 100644 index f5b7f2b..0000000 --- a/include/asm-ppc/ioctls.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef _ASM_PPC_IOCTLS_H -#define _ASM_PPC_IOCTLS_H - -#include - -#define FIOCLEX _IO('f', 1) -#define FIONCLEX _IO('f', 2) -#define FIOASYNC _IOW('f', 125, int) -#define FIONBIO _IOW('f', 126, int) -#define FIONREAD _IOR('f', 127, int) -#define TIOCINQ FIONREAD -#define FIOQSIZE _IOR('f', 128, loff_t) - -#define TIOCGETP _IOR('t', 8, struct sgttyb) -#define TIOCSETP _IOW('t', 9, struct sgttyb) -#define TIOCSETN _IOW('t', 10, struct sgttyb) /* TIOCSETP wo flush */ - -#define TIOCSETC _IOW('t', 17, struct tchars) -#define TIOCGETC _IOR('t', 18, struct tchars) -#define TCGETS _IOR('t', 19, struct termios) -#define TCSETS _IOW('t', 20, struct termios) -#define TCSETSW _IOW('t', 21, struct termios) -#define TCSETSF _IOW('t', 22, struct termios) - -#define TCGETA _IOR('t', 23, struct termio) -#define TCSETA _IOW('t', 24, struct termio) -#define TCSETAW _IOW('t', 25, struct termio) -#define TCSETAF _IOW('t', 28, struct termio) - -#define TCSBRK _IO('t', 29) -#define TCXONC _IO('t', 30) -#define TCFLSH _IO('t', 31) - -#define TIOCSWINSZ _IOW('t', 103, struct winsize) -#define TIOCGWINSZ _IOR('t', 104, struct winsize) -#define TIOCSTART _IO('t', 110) /* start output, like ^Q */ -#define TIOCSTOP _IO('t', 111) /* stop output, like ^S */ -#define TIOCOUTQ _IOR('t', 115, int) /* output queue size */ - -#define TIOCGLTC _IOR('t', 116, struct ltchars) -#define TIOCSLTC _IOW('t', 117, struct ltchars) -#define TIOCSPGRP _IOW('t', 118, int) -#define TIOCGPGRP _IOR('t', 119, int) - -#define TIOCEXCL 0x540C -#define TIOCNXCL 0x540D -#define TIOCSCTTY 0x540E - -#define TIOCSTI 0x5412 -#define TIOCMGET 0x5415 -#define TIOCMBIS 0x5416 -#define TIOCMBIC 0x5417 -#define TIOCMSET 0x5418 -# define TIOCM_LE 0x001 -# define TIOCM_DTR 0x002 -# define TIOCM_RTS 0x004 -# define TIOCM_ST 0x008 -# define TIOCM_SR 0x010 -# define TIOCM_CTS 0x020 -# define TIOCM_CAR 0x040 -# define TIOCM_RNG 0x080 -# define TIOCM_DSR 0x100 -# define TIOCM_CD TIOCM_CAR -# define TIOCM_RI TIOCM_RNG - -#define TIOCGSOFTCAR 0x5419 -#define TIOCSSOFTCAR 0x541A -#define TIOCLINUX 0x541C -#define TIOCCONS 0x541D -#define TIOCGSERIAL 0x541E -#define TIOCSSERIAL 0x541F -#define TIOCPKT 0x5420 -# define TIOCPKT_DATA 0 -# define TIOCPKT_FLUSHREAD 1 -# define TIOCPKT_FLUSHWRITE 2 -# define TIOCPKT_STOP 4 -# define TIOCPKT_START 8 -# define TIOCPKT_NOSTOP 16 -# define TIOCPKT_DOSTOP 32 - - -#define TIOCNOTTY 0x5422 -#define TIOCSETD 0x5423 -#define TIOCGETD 0x5424 -#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ -#define TIOCSBRK 0x5427 /* BSD compatibility */ -#define TIOCCBRK 0x5428 /* BSD compatibility */ -#define TIOCGSID 0x5429 /* Return the session ID of FD */ -#define TIOCGPTN _IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */ -#define TIOCSPTLCK _IOW('T',0x31, int) /* Lock/unlock Pty */ - -#define TIOCSERCONFIG 0x5453 -#define TIOCSERGWILD 0x5454 -#define TIOCSERSWILD 0x5455 -#define TIOCGLCKTRMIOS 0x5456 -#define TIOCSLCKTRMIOS 0x5457 -#define TIOCSERGSTRUCT 0x5458 /* For debugging only */ -#define TIOCSERGETLSR 0x5459 /* Get line status register */ - /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ -# define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ -#define TIOCSERGETMULTI 0x545A /* Get multiport config */ -#define TIOCSERSETMULTI 0x545B /* Set multiport config */ - -#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ -#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ - -#endif /* _ASM_PPC_IOCTLS_H */ diff --git a/include/asm-ppc/local.h b/include/asm-ppc/local.h deleted file mode 100644 index b08e3ec..0000000 --- a/include/asm-ppc/local.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __PPC_LOCAL_H -#define __PPC_LOCAL_H - -#include - -#endif /* __PPC_LOCAL_H */ diff --git a/include/asm-ppc/namei.h b/include/asm-ppc/namei.h deleted file mode 100644 index 29c9ec8..0000000 --- a/include/asm-ppc/namei.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * include/asm-ppc/namei.h - * Adapted from include/asm-alpha/namei.h - * - * Included from fs/namei.c - */ - -#ifdef __KERNEL__ -#ifndef __PPC_NAMEI_H -#define __PPC_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __PPC_NAMEI_H */ -#endif /* __KERNEL__ */ diff --git a/include/asm-ppc/percpu.h b/include/asm-ppc/percpu.h deleted file mode 100644 index d66667c..0000000 --- a/include/asm-ppc/percpu.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ARCH_PPC_PERCPU__ -#define __ARCH_PPC_PERCPU__ - -#include - -#endif /* __ARCH_PPC_PERCPU__ */ diff --git a/include/asm-ppc/poll.h b/include/asm-ppc/poll.h deleted file mode 100644 index be50249..0000000 --- a/include/asm-ppc/poll.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __PPC_POLL_H -#define __PPC_POLL_H - -#define POLLIN 0x0001 -#define POLLPRI 0x0002 -#define POLLOUT 0x0004 -#define POLLERR 0x0008 -#define POLLHUP 0x0010 -#define POLLNVAL 0x0020 -#define POLLRDNORM 0x0040 -#define POLLRDBAND 0x0080 -#define POLLWRNORM 0x0100 -#define POLLWRBAND 0x0200 -#define POLLMSG 0x0400 -#define POLLREMOVE 0x1000 - -struct pollfd { - int fd; - short events; - short revents; -}; - -#endif diff --git a/include/asm-ppc/resource.h b/include/asm-ppc/resource.h deleted file mode 100644 index 86a1ea2..0000000 --- a/include/asm-ppc/resource.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _PPC_RESOURCE_H -#define _PPC_RESOURCE_H - -#include - -#endif diff --git a/include/asm-ppc/shmparam.h b/include/asm-ppc/shmparam.h deleted file mode 100644 index d625060..0000000 --- a/include/asm-ppc/shmparam.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _PPC_SHMPARAM_H -#define _PPC_SHMPARAM_H - -#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ - -#endif /* _PPC_SHMPARAM_H */ diff --git a/include/asm-ppc/string.h b/include/asm-ppc/string.h deleted file mode 100644 index 2255759..0000000 --- a/include/asm-ppc/string.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _PPC_STRING_H_ -#define _PPC_STRING_H_ - -#ifdef __KERNEL__ - -#define __HAVE_ARCH_STRCPY -#define __HAVE_ARCH_STRNCPY -#define __HAVE_ARCH_STRLEN -#define __HAVE_ARCH_STRCMP -#define __HAVE_ARCH_STRCAT -#define __HAVE_ARCH_MEMSET -#define __HAVE_ARCH_MEMCPY -#define __HAVE_ARCH_MEMMOVE -#define __HAVE_ARCH_MEMCMP -#define __HAVE_ARCH_MEMCHR - -extern int strcasecmp(const char *, const char *); -extern int strncasecmp(const char *, const char *, int); -extern char * strcpy(char *,const char *); -extern char * strncpy(char *,const char *, __kernel_size_t); -extern __kernel_size_t strlen(const char *); -extern int strcmp(const char *,const char *); -extern char * strcat(char *, const char *); -extern void * memset(void *,int,__kernel_size_t); -extern void * memcpy(void *,const void *,__kernel_size_t); -extern void * memmove(void *,const void *,__kernel_size_t); -extern int memcmp(const void *,const void *,__kernel_size_t); -extern void * memchr(const void *,int,__kernel_size_t); - -#endif /* __KERNEL__ */ - -#endif diff --git a/include/asm-ppc/unaligned.h b/include/asm-ppc/unaligned.h deleted file mode 100644 index 45520d9..0000000 --- a/include/asm-ppc/unaligned.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifdef __KERNEL__ -#ifndef __PPC_UNALIGNED_H -#define __PPC_UNALIGNED_H - -/* - * The PowerPC can do unaligned accesses itself in big endian mode. - * - * The strange macros are there to make sure these can't - * be misused in a way that makes them not work on other - * architectures where unaligned accesses aren't as simple. - */ - -#define get_unaligned(ptr) (*(ptr)) - -#define put_unaligned(val, ptr) ((void)( *(ptr) = (val) )) - -#endif -#endif /* __KERNEL__ */ diff --git a/include/asm-ppc64/errno.h b/include/asm-ppc64/errno.h deleted file mode 100644 index 69bc3b0..0000000 --- a/include/asm-ppc64/errno.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _PPC64_ERRNO_H -#define _PPC64_ERRNO_H - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include - -#undef EDEADLOCK -#define EDEADLOCK 58 /* File locking deadlock error */ - -#define _LAST_ERRNO 516 - -#endif diff --git a/include/asm-ppc64/ioctl.h b/include/asm-ppc64/ioctl.h deleted file mode 100644 index 42b8c5d..0000000 --- a/include/asm-ppc64/ioctl.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef _PPC64_IOCTL_H -#define _PPC64_IOCTL_H - - -/* - * This was copied from the alpha as it's a bit cleaner there. - * -- Cort - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define _IOC_NRBITS 8 -#define _IOC_TYPEBITS 8 -#define _IOC_SIZEBITS 13 -#define _IOC_DIRBITS 3 - -#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1) -#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1) -#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1) -#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1) - -#define _IOC_NRSHIFT 0 -#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS) -#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS) -#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS) - -/* - * Direction bits _IOC_NONE could be 0, but OSF/1 gives it a bit. - * And this turns out useful to catch old ioctl numbers in header - * files for us. - */ -#define _IOC_NONE 1U -#define _IOC_READ 2U -#define _IOC_WRITE 4U - -#define _IOC(dir,type,nr,size) \ - (((dir) << _IOC_DIRSHIFT) | \ - ((type) << _IOC_TYPESHIFT) | \ - ((nr) << _IOC_NRSHIFT) | \ - ((size) << _IOC_SIZESHIFT)) - -/* provoke compile error for invalid uses of size argument */ -extern unsigned int __invalid_size_argument_for_IOC; -#define _IOC_TYPECHECK(t) \ - ((sizeof(t) == sizeof(t[1]) && \ - sizeof(t) < (1 << _IOC_SIZEBITS)) ? \ - sizeof(t) : __invalid_size_argument_for_IOC) - -/* used to create numbers */ -#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) -#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size))) -#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) -#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) -#define _IOR_BAD(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size)) -#define _IOW_BAD(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size)) -#define _IOWR_BAD(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size)) - -/* used to decode them.. */ -#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) -#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) -#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) -#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK) - -/* various drivers, such as the pcmcia stuff, need these... */ -#define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT) -#define IOC_OUT (_IOC_READ << _IOC_DIRSHIFT) -#define IOC_INOUT ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT) -#define IOCSIZE_MASK (_IOC_SIZEMASK << _IOC_SIZESHIFT) -#define IOCSIZE_SHIFT (_IOC_SIZESHIFT) - -#endif /* _PPC64_IOCTL_H */ diff --git a/include/asm-ppc64/ioctls.h b/include/asm-ppc64/ioctls.h deleted file mode 100644 index 48796bf..0000000 --- a/include/asm-ppc64/ioctls.h +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef _ASM_PPC64_IOCTLS_H -#define _ASM_PPC64_IOCTLS_H - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include - -#define FIOCLEX _IO('f', 1) -#define FIONCLEX _IO('f', 2) -#define FIOASYNC _IOW('f', 125, int) -#define FIONBIO _IOW('f', 126, int) -#define FIONREAD _IOR('f', 127, int) -#define TIOCINQ FIONREAD -#define FIOQSIZE _IOR('f', 128, loff_t) - -#define TIOCGETP _IOR('t', 8, struct sgttyb) -#define TIOCSETP _IOW('t', 9, struct sgttyb) -#define TIOCSETN _IOW('t', 10, struct sgttyb) /* TIOCSETP wo flush */ - -#define TIOCSETC _IOW('t', 17, struct tchars) -#define TIOCGETC _IOR('t', 18, struct tchars) -#define TCGETS _IOR('t', 19, struct termios) -#define TCSETS _IOW('t', 20, struct termios) -#define TCSETSW _IOW('t', 21, struct termios) -#define TCSETSF _IOW('t', 22, struct termios) - -#define TCGETA _IOR('t', 23, struct termio) -#define TCSETA _IOW('t', 24, struct termio) -#define TCSETAW _IOW('t', 25, struct termio) -#define TCSETAF _IOW('t', 28, struct termio) - -#define TCSBRK _IO('t', 29) -#define TCXONC _IO('t', 30) -#define TCFLSH _IO('t', 31) - -#define TIOCSWINSZ _IOW('t', 103, struct winsize) -#define TIOCGWINSZ _IOR('t', 104, struct winsize) -#define TIOCSTART _IO('t', 110) /* start output, like ^Q */ -#define TIOCSTOP _IO('t', 111) /* stop output, like ^S */ -#define TIOCOUTQ _IOR('t', 115, int) /* output queue size */ - -#define TIOCGLTC _IOR('t', 116, struct ltchars) -#define TIOCSLTC _IOW('t', 117, struct ltchars) -#define TIOCSPGRP _IOW('t', 118, int) -#define TIOCGPGRP _IOR('t', 119, int) - -#define TIOCEXCL 0x540C -#define TIOCNXCL 0x540D -#define TIOCSCTTY 0x540E - -#define TIOCSTI 0x5412 -#define TIOCMGET 0x5415 -#define TIOCMBIS 0x5416 -#define TIOCMBIC 0x5417 -#define TIOCMSET 0x5418 -# define TIOCM_LE 0x001 -# define TIOCM_DTR 0x002 -# define TIOCM_RTS 0x004 -# define TIOCM_ST 0x008 -# define TIOCM_SR 0x010 -# define TIOCM_CTS 0x020 -# define TIOCM_CAR 0x040 -# define TIOCM_RNG 0x080 -# define TIOCM_DSR 0x100 -# define TIOCM_CD TIOCM_CAR -# define TIOCM_RI TIOCM_RNG - -#define TIOCGSOFTCAR 0x5419 -#define TIOCSSOFTCAR 0x541A -#define TIOCLINUX 0x541C -#define TIOCCONS 0x541D -#define TIOCGSERIAL 0x541E -#define TIOCSSERIAL 0x541F -#define TIOCPKT 0x5420 -# define TIOCPKT_DATA 0 -# define TIOCPKT_FLUSHREAD 1 -# define TIOCPKT_FLUSHWRITE 2 -# define TIOCPKT_STOP 4 -# define TIOCPKT_START 8 -# define TIOCPKT_NOSTOP 16 -# define TIOCPKT_DOSTOP 32 - - -#define TIOCNOTTY 0x5422 -#define TIOCSETD 0x5423 -#define TIOCGETD 0x5424 -#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ -#define TIOCSBRK 0x5427 /* BSD compatibility */ -#define TIOCCBRK 0x5428 /* BSD compatibility */ -#define TIOCGSID 0x5429 /* Return the session ID of FD */ -#define TIOCGPTN _IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */ -#define TIOCSPTLCK _IOW('T',0x31, int) /* Lock/unlock Pty */ - -#define TIOCSERCONFIG 0x5453 -#define TIOCSERGWILD 0x5454 -#define TIOCSERSWILD 0x5455 -#define TIOCGLCKTRMIOS 0x5456 -#define TIOCSLCKTRMIOS 0x5457 -#define TIOCSERGSTRUCT 0x5458 /* For debugging only */ -#define TIOCSERGETLSR 0x5459 /* Get line status register */ - /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ -# define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ -#define TIOCSERGETMULTI 0x545A /* Get multiport config */ -#define TIOCSERSETMULTI 0x545B /* Set multiport config */ - -#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ -#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ - -#endif /* _ASM_PPC64_IOCTLS_H */ diff --git a/include/asm-ppc64/local.h b/include/asm-ppc64/local.h deleted file mode 100644 index c11c530..0000000 --- a/include/asm-ppc64/local.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-ppc64/namei.h b/include/asm-ppc64/namei.h deleted file mode 100644 index a1412a2..0000000 --- a/include/asm-ppc64/namei.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * linux/include/asm-ppc/namei.h - * Adapted from linux/include/asm-alpha/namei.h - * - * Included from linux/fs/namei.c - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef __PPC64_NAMEI_H -#define __PPC64_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __PPC64_NAMEI_H */ diff --git a/include/asm-ppc64/percpu.h b/include/asm-ppc64/percpu.h deleted file mode 100644 index 60a659a..0000000 --- a/include/asm-ppc64/percpu.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ARCH_PPC64_PERCPU__ -#define __ARCH_PPC64_PERCPU__ - -#include - -#endif /* __ARCH_PPC64_PERCPU__ */ diff --git a/include/asm-ppc64/poll.h b/include/asm-ppc64/poll.h deleted file mode 100644 index 370fa3b..0000000 --- a/include/asm-ppc64/poll.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef __PPC64_POLL_H -#define __PPC64_POLL_H - -/* - * Copyright (C) 2001 PPC64 Team, IBM Corp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define POLLIN 0x0001 -#define POLLPRI 0x0002 -#define POLLOUT 0x0004 -#define POLLERR 0x0008 -#define POLLHUP 0x0010 -#define POLLNVAL 0x0020 -#define POLLRDNORM 0x0040 -#define POLLRDBAND 0x0080 -#define POLLWRNORM 0x0100 -#define POLLWRBAND 0x0200 -#define POLLMSG 0x0400 -#define POLLREMOVE 0x1000 - -struct pollfd { - int fd; - short events; - short revents; -}; - -#endif /* __PPC64_POLL_H */ diff --git a/include/asm-ppc64/resource.h b/include/asm-ppc64/resource.h deleted file mode 100644 index add031b..0000000 --- a/include/asm-ppc64/resource.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _PPC64_RESOURCE_H -#define _PPC64_RESOURCE_H - -#include - -#endif /* _PPC64_RESOURCE_H */ diff --git a/include/asm-ppc64/shmparam.h b/include/asm-ppc64/shmparam.h deleted file mode 100644 index b2825ce..0000000 --- a/include/asm-ppc64/shmparam.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _PPC64_SHMPARAM_H -#define _PPC64_SHMPARAM_H - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ - -#endif /* _PPC64_SHMPARAM_H */ diff --git a/include/asm-ppc64/string.h b/include/asm-ppc64/string.h deleted file mode 100644 index eeca68e..0000000 --- a/include/asm-ppc64/string.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef _PPC64_STRING_H_ -#define _PPC64_STRING_H_ - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define __HAVE_ARCH_STRCPY -#define __HAVE_ARCH_STRNCPY -#define __HAVE_ARCH_STRLEN -#define __HAVE_ARCH_STRCMP -#define __HAVE_ARCH_STRCAT -#define __HAVE_ARCH_MEMSET -#define __HAVE_ARCH_MEMCPY -#define __HAVE_ARCH_MEMMOVE -#define __HAVE_ARCH_MEMCMP -#define __HAVE_ARCH_MEMCHR - -extern int strcasecmp(const char *, const char *); -extern int strncasecmp(const char *, const char *, int); -extern char * strcpy(char *,const char *); -extern char * strncpy(char *,const char *, __kernel_size_t); -extern __kernel_size_t strlen(const char *); -extern int strcmp(const char *,const char *); -extern char * strcat(char *, const char *); -extern void * memset(void *,int,__kernel_size_t); -extern void * memcpy(void *,const void *,__kernel_size_t); -extern void * memmove(void *,const void *,__kernel_size_t); -extern int memcmp(const void *,const void *,__kernel_size_t); -extern void * memchr(const void *,int,__kernel_size_t); - -#endif /* _PPC64_STRING_H_ */ diff --git a/include/asm-ppc64/unaligned.h b/include/asm-ppc64/unaligned.h deleted file mode 100644 index 636e93c..0000000 --- a/include/asm-ppc64/unaligned.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __PPC64_UNALIGNED_H -#define __PPC64_UNALIGNED_H - -/* - * The PowerPC can do unaligned accesses itself in big endian mode. - * - * The strange macros are there to make sure these can't - * be misused in a way that makes them not work on other - * architectures where unaligned accesses aren't as simple. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define get_unaligned(ptr) (*(ptr)) - -#define put_unaligned(val, ptr) ((void)( *(ptr) = (val) )) - -#endif /* __PPC64_UNALIGNED_H */ -- cgit v0.10.2 From 7fea82ab1a74030f79a2adfac1af3d93b8638fc3 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Sun, 28 Aug 2005 21:42:10 -0500 Subject: [PATCH] PPC64: Don't try to claim memory from OF at 1GB mark Some RS64-based machines (p620, F80, others) have problems with firmware returning 0xdeadbeef instead of failure to allocations that end at the 1GB mark. We have two options: 1. Detect the undocumented 0xdeadbeef return value and interpret it as a failure. 2. Avoid allocating that high. (2) is really the cleaner solution here. 768MB is plenty of room so use that as the max alloc_top instead of 1GB. Signed-off-by: Olof Johansson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/prom_init.c b/arch/ppc64/kernel/prom_init.c index adcf972..122283a 100644 --- a/arch/ppc64/kernel/prom_init.c +++ b/arch/ppc64/kernel/prom_init.c @@ -892,7 +892,10 @@ static void __init prom_init_mem(void) if ( RELOC(of_platform) == PLATFORM_PSERIES_LPAR ) RELOC(alloc_top) = RELOC(rmo_top); else - RELOC(alloc_top) = RELOC(rmo_top) = min(0x40000000ul, RELOC(ram_top)); + /* Some RS64 machines have buggy firmware where claims up at 1GB + * fails. Cap at 768MB as a workaround. Still plenty of room. + */ + RELOC(alloc_top) = RELOC(rmo_top) = min(0x30000000ul, RELOC(ram_top)); prom_printf("memory layout at init:\n"); prom_printf(" memory_limit : %x (16 MB aligned)\n", RELOC(prom_memory_limit)); -- cgit v0.10.2 From 04ed65190a5d1562220dd3a7fc9eac2402c7104c Mon Sep 17 00:00:00 2001 From: Jake Moilanen Date: Wed, 24 Aug 2005 15:22:12 -0500 Subject: [PATCH] oprofile PVR 970MP Here's the 970MP's PVR (processor version register) entry for oprofile. Signed-off-by: Jake Moilanen Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/oprofile/common.c b/arch/ppc64/oprofile/common.c index b28bfda..4acd1a4 100644 --- a/arch/ppc64/oprofile/common.c +++ b/arch/ppc64/oprofile/common.c @@ -153,6 +153,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) case PV_970: case PV_970FX: + case PV_970MP: model = &op_model_power4; model->num_counters = 8; ops->cpu_type = "ppc64/970"; diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h index 50b14c0..7bd4796 100644 --- a/include/asm-ppc64/processor.h +++ b/include/asm-ppc64/processor.h @@ -268,6 +268,7 @@ #define PV_970FX 0x003C #define PV_630 0x0040 #define PV_630p 0x0041 +#define PV_970MP 0x0044 #define PV_BE 0x0070 /* Platforms supported by PPC64 */ -- cgit v0.10.2 From 717522ff44f1fbee5ea09e83d7cd4b5c956e30f9 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 24 Aug 2005 08:53:03 +1000 Subject: [PATCH] ppc64: Add CONFIG_HZ While ppc64 has the CONFIG_HZ Kconfig option, it wasnt actually being used. Connect it up and set all platforms to 250Hz. Signed-off-by: Anton Blanchard Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/configs/g5_defconfig b/arch/ppc64/configs/g5_defconfig index ab56774..fc83d93 100644 --- a/arch/ppc64/configs/g5_defconfig +++ b/arch/ppc64/configs/g5_defconfig @@ -103,10 +103,10 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_BKL is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 CONFIG_GENERIC_HARDIRQS=y CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc64/configs/iSeries_defconfig b/arch/ppc64/configs/iSeries_defconfig index 219c667..013d4e0 100644 --- a/arch/ppc64/configs/iSeries_defconfig +++ b/arch/ppc64/configs/iSeries_defconfig @@ -94,10 +94,10 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_BKL is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 CONFIG_GENERIC_HARDIRQS=y CONFIG_LPARCFG=y CONFIG_SECCOMP=y diff --git a/arch/ppc64/configs/maple_defconfig b/arch/ppc64/configs/maple_defconfig index 2033fe6..dd42892 100644 --- a/arch/ppc64/configs/maple_defconfig +++ b/arch/ppc64/configs/maple_defconfig @@ -103,10 +103,10 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_BKL is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 CONFIG_GENERIC_HARDIRQS=y CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc64/configs/pSeries_defconfig b/arch/ppc64/configs/pSeries_defconfig index 297fd52..29f7b80 100644 --- a/arch/ppc64/configs/pSeries_defconfig +++ b/arch/ppc64/configs/pSeries_defconfig @@ -112,10 +112,10 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_BKL is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 CONFIG_EEH=y CONFIG_GENERIC_HARDIRQS=y CONFIG_PPC_RTAS=y diff --git a/arch/ppc64/defconfig b/arch/ppc64/defconfig index c361e77..7cb4750 100644 --- a/arch/ppc64/defconfig +++ b/arch/ppc64/defconfig @@ -114,10 +114,10 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_BKL is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 CONFIG_EEH=y CONFIG_GENERIC_HARDIRQS=y CONFIG_PPC_RTAS=y diff --git a/include/asm-ppc64/param.h b/include/asm-ppc64/param.h index 1fad38d..76c212d 100644 --- a/include/asm-ppc64/param.h +++ b/include/asm-ppc64/param.h @@ -1,6 +1,8 @@ #ifndef _ASM_PPC64_PARAM_H #define _ASM_PPC64_PARAM_H +#include + /* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -9,7 +11,7 @@ */ #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ +# define HZ CONFIG_HZ /* Internal kernel timer frequency */ # define USER_HZ 100 /* .. some user interfaces are in "ticks" */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */ #endif -- cgit v0.10.2 From 5ff98ae18bec792d77bfea801aa4b3385b98b355 Mon Sep 17 00:00:00 2001 From: Joel Schopp Date: Thu, 11 Aug 2005 17:39:28 -0500 Subject: [PATCH] ppc64: of_device.c remove useless code Coverity found more unused code. Signed-off-by: Joel Schopp Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/of_device.c b/arch/ppc64/kernel/of_device.c index b80e819..da58081 100644 --- a/arch/ppc64/kernel/of_device.c +++ b/arch/ppc64/kernel/of_device.c @@ -236,7 +236,6 @@ void of_device_unregister(struct of_device *ofdev) struct of_device* of_platform_device_create(struct device_node *np, const char *bus_id) { struct of_device *dev; - u32 *reg; dev = kmalloc(sizeof(*dev), GFP_KERNEL); if (!dev) @@ -250,7 +249,6 @@ struct of_device* of_platform_device_create(struct device_node *np, const char * dev->dev.bus = &of_platform_bus_type; dev->dev.release = of_release_dev; - reg = (u32 *)get_property(np, "reg", NULL); strlcpy(dev->dev.bus_id, bus_id, BUS_ID_SIZE); if (of_device_register(dev) != 0) { -- cgit v0.10.2 From 597f95e2bfbe9b83ed8b0761ebf4e7d55fd4df17 Mon Sep 17 00:00:00 2001 From: Joel Schopp Date: Fri, 12 Aug 2005 14:34:58 -0500 Subject: [PATCH] ppc64: lparconfig.c memory leak This patch fixes a rare memory leak found by Coverity. Signed-off-by: Joel Schopp Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/kernel/lparcfg.c b/arch/ppc64/kernel/lparcfg.c index 9d034ff..edad361 100644 --- a/arch/ppc64/kernel/lparcfg.c +++ b/arch/ppc64/kernel/lparcfg.c @@ -273,6 +273,7 @@ static void parse_system_parameter_string(struct seq_file *m) if (!workbuffer) { printk(KERN_ERR "%s %s kmalloc failure at line %d \n", __FILE__, __FUNCTION__, __LINE__); + kfree(local_buffer); return; } #ifdef LPARCFG_DEBUG -- cgit v0.10.2 From 8913ca1c9ccb5eb6471afd419159729eef6e2730 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 27 Jul 2005 15:47:23 +1000 Subject: [PATCH] Remove nested feature sections The {BEGIN,END}_FTR_SECTION asm macros used in ppc64 to nop out sections of code at runtime cannot be nested. However, we do nest them in hash_low.S. We get away with it there, because there is nothing between the BEGIN markers for each section. However, that's confusing to someone reading the code. This patch removes the nested ifset and ifclr feature sections, replacing them with a single feature section in the full mask/value form. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S index fbff248..35eb49e 100644 --- a/arch/ppc64/mm/hash_low.S +++ b/arch/ppc64/mm/hash_low.S @@ -129,12 +129,10 @@ _GLOBAL(__hash_page) * code rather than call a C function...) */ BEGIN_FTR_SECTION -BEGIN_FTR_SECTION mr r4,r30 mr r5,r7 bl .hash_page_do_lazy_icache -END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE) -END_FTR_SECTION_IFCLR(CPU_FTR_COHERENT_ICACHE) +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) /* At this point, r3 contains new PP bits, save them in * place of "access" in the param area (sic) -- cgit v0.10.2 From 7ef24b69f9ff4858d7242059fbb19477c10e6dd7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 25 Aug 2005 17:14:46 -0700 Subject: [PATCH] s2io build fix Damir Perisa reports: drivers/net/s2io.h:765: error: invalid lvalue in assignment drivers/net/s2io.h:766: error: invalid lvalue in assignment That's a gcc4 error. I don't see why the casts are there anyway.. Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Jeff Garzik diff --git a/drivers/net/s2io.h b/drivers/net/s2io.h index 5d92707..bc64d96 100644 --- a/drivers/net/s2io.h +++ b/drivers/net/s2io.h @@ -762,8 +762,8 @@ static inline u64 readq(void __iomem *addr) { u64 ret = 0; ret = readl(addr + 4); - (u64) ret <<= 32; - (u64) ret |= readl(addr); + ret <<= 32; + ret |= readl(addr); return ret; } -- cgit v0.10.2 From d8971fcb702e24d1e22c77fd1772f182ffee87e3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 29 Aug 2005 22:51:28 -0700 Subject: [INET]: compile errors when DEBUG is defined Fix build problem found by compiling driver with DEBUG defined that used tcp.h. Since pr_debug(arg) expands to printk("<7>" arg) the argument needs to be string that can be concatenated. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 8a87a3a..651f824 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -147,7 +147,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) } #ifdef INET_CSK_DEBUG else { - pr_debug(inet_csk_timer_bug_msg); + pr_debug("%s", inet_csk_timer_bug_msg); } #endif } @@ -180,7 +180,7 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, } #ifdef INET_CSK_DEBUG else { - pr_debug(inet_csk_timer_bug_msg); + pr_debug("%s", inet_csk_timer_bug_msg); } #endif } -- cgit v0.10.2