From ec924b4726e3df000d3ac7ae10cb8ef1adcd60ca Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 17 Jul 2006 18:20:51 +0300 Subject: IB/uverbs: Fix unlocking in error paths ib_uverbs_create_ah() and ib_uverbs_create_srq() did not release the PD's read lock in their error paths, which lead to deadlock when destroying the PD. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index bdf5d50..0371806 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1775,7 +1775,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, ah = ib_create_ah(pd, &attr); if (IS_ERR(ah)) { ret = PTR_ERR(ah); - goto err; + goto err_put; } ah->uobject = uobj; @@ -1811,6 +1811,9 @@ err_copy: err_destroy: ib_destroy_ah(ah); +err_put: + put_pd_read(pd); + err: put_uobj_write(uobj); return ret; @@ -1984,7 +1987,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, srq = pd->device->create_srq(pd, &attr, &udata); if (IS_ERR(srq)) { ret = PTR_ERR(srq); - goto err; + goto err_put; } srq->device = pd->device; @@ -2029,6 +2032,9 @@ err_copy: err_destroy: ib_destroy_srq(srq); +err_put: + put_pd_read(pd); + err: put_uobj_write(&obj->uobject); return ret; -- cgit v0.10.2 From 43db2bc04409b1e1b74f9768e3284cec18a87d0b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sun, 23 Jul 2006 15:16:04 -0700 Subject: IB/uverbs: Fix lockdep warnings Lockdep warns because uverbs is trying to take uobj->mutex when it already holds that lock. This is because there are really multiple types of uobjs even though all of their locks are initialized in common code. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 0371806..30923eb 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -42,6 +42,13 @@ #include "uverbs.h" +static struct lock_class_key pd_lock_key; +static struct lock_class_key mr_lock_key; +static struct lock_class_key cq_lock_key; +static struct lock_class_key qp_lock_key; +static struct lock_class_key ah_lock_key; +static struct lock_class_key srq_lock_key; + #define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ do { \ (udata)->inbuf = (void __user *) (ibuf); \ @@ -76,12 +83,13 @@ */ static void init_uobj(struct ib_uobject *uobj, u64 user_handle, - struct ib_ucontext *context) + struct ib_ucontext *context, struct lock_class_key *key) { uobj->user_handle = user_handle; uobj->context = context; kref_init(&uobj->ref); init_rwsem(&uobj->mutex); + lockdep_set_class(&uobj->mutex, key); uobj->live = 0; } @@ -470,7 +478,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, if (!uobj) return -ENOMEM; - init_uobj(uobj, 0, file->ucontext); + init_uobj(uobj, 0, file->ucontext, &pd_lock_key); down_write(&uobj->mutex); pd = file->device->ib_dev->alloc_pd(file->device->ib_dev, @@ -591,7 +599,7 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if (!obj) return -ENOMEM; - init_uobj(&obj->uobject, 0, file->ucontext); + init_uobj(&obj->uobject, 0, file->ucontext, &mr_lock_key); down_write(&obj->uobject.mutex); /* @@ -770,7 +778,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, if (!obj) return -ENOMEM; - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext); + init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_key); down_write(&obj->uobject.mutex); if (cmd.comp_channel >= 0) { @@ -1051,13 +1059,14 @@ ssize_t 
ib_uverbs_create_qp(struct ib_uverbs_file *file, if (!obj) return -ENOMEM; - init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext); + init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key); down_write(&obj->uevent.uobject.mutex); + srq = cmd.is_srq ? idr_read_srq(cmd.srq_handle, file->ucontext) : NULL; pd = idr_read_pd(cmd.pd_handle, file->ucontext); scq = idr_read_cq(cmd.send_cq_handle, file->ucontext); - rcq = idr_read_cq(cmd.recv_cq_handle, file->ucontext); - srq = cmd.is_srq ? idr_read_srq(cmd.srq_handle, file->ucontext) : NULL; + rcq = cmd.recv_cq_handle == cmd.send_cq_handle ? + scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext); if (!pd || !scq || !rcq || (cmd.is_srq && !srq)) { ret = -EINVAL; @@ -1125,7 +1134,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, put_pd_read(pd); put_cq_read(scq); - put_cq_read(rcq); + if (rcq != scq) + put_cq_read(rcq); if (srq) put_srq_read(srq); @@ -1150,7 +1160,7 @@ err_put: put_pd_read(pd); if (scq) put_cq_read(scq); - if (rcq) + if (rcq && rcq != scq) put_cq_read(rcq); if (srq) put_srq_read(srq); @@ -1751,7 +1761,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, if (!uobj) return -ENOMEM; - init_uobj(uobj, cmd.user_handle, file->ucontext); + init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_key); down_write(&uobj->mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); @@ -1966,7 +1976,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, if (!obj) return -ENOMEM; - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext); + init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &srq_lock_key); down_write(&obj->uobject.mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); -- cgit v0.10.2 From 1252c517cf3df240ae51946a096035765dfd2e6d Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Thu, 13 Jul 2006 11:05:49 +0300 Subject: IB/mthca: Fix SRQ limit event range check Mem-free HCAs always keep one spare SRQ WQE, so the SRQ limit cannot be set beyond srq->max - 1. Signed-off-by: Dotan Barak Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index fab417c..b60a9d7 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -370,7 +370,8 @@ int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, return -EINVAL; if (attr_mask & IB_SRQ_LIMIT) { - if (attr->srq_limit > srq->max) + u32 max_wr = mthca_is_memfree(dev) ? srq->max - 1 : srq->max; + if (attr->srq_limit > max_wr) return -EINVAL; mutex_lock(&srq->mutex); -- cgit v0.10.2 From 3d37b9e209136cf178562bbedc7cd2ecb1da8beb Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 17 Jul 2006 18:18:36 -0700 Subject: IB/ipath: Fix a data corruption This patch fixes a problem where certain error packets are passed to the InfiniBand layer for processing even though the packet actually was received with an error. 
Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 823131d..f98518d 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -859,6 +859,38 @@ static void ipath_rcv_layer(struct ipath_devdata *dd, u32 etail, __ipath_layer_rcv_lid(dd, hdr); } +static void ipath_rcv_hdrerr(struct ipath_devdata *dd, + u32 eflags, + u32 l, + u32 etail, + u64 *rc) +{ + char emsg[128]; + struct ipath_message_header *hdr; + + get_rhf_errstring(eflags, emsg, sizeof emsg); + hdr = (struct ipath_message_header *)&rc[1]; + ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " + "tlen=%x opcode=%x egridx=%x: %s\n", + eflags, l, + ipath_hdrget_rcv_type((__le32 *) rc), + ipath_hdrget_length_in_bytes((__le32 *) rc), + be32_to_cpu(hdr->bth[0]) >> 24, + etail, emsg); + + /* Count local link integrity errors. */ + if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) { + u8 n = (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + + if (++dd->ipath_lli_counter > n) { + dd->ipath_lli_counter = 0; + dd->ipath_lli_errors++; + } + } +} + /* * ipath_kreceive - receive a packet * @dd: the infinipath device @@ -875,7 +907,6 @@ void ipath_kreceive(struct ipath_devdata *dd) struct ipath_message_header *hdr; u32 eflags, i, etype, tlen, pkttot = 0, updegr=0, reloop=0; static u64 totcalls; /* stats, may eventually remove */ - char emsg[128]; if (!dd->ipath_hdrqtailptr) { ipath_dev_err(dd, @@ -938,26 +969,9 @@ reloop: "%x\n", etype); } - if (eflags & ~(INFINIPATH_RHF_H_TIDERR | - INFINIPATH_RHF_H_IHDRERR)) { - get_rhf_errstring(eflags, emsg, sizeof emsg); - ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " - "tlen=%x opcode=%x egridx=%x: %s\n", - eflags, l, etype, tlen, bthbytes[0], - ipath_hdrget_index((__le32 *) rc), emsg); - /* Count local link integrity errors. */ - if (eflags & (INFINIPATH_RHF_H_ICRCERR | - INFINIPATH_RHF_H_VCRCERR)) { - u8 n = (dd->ipath_ibcctrl >> - INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & - INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; - - if (++dd->ipath_lli_counter > n) { - dd->ipath_lli_counter = 0; - dd->ipath_lli_errors++; - } - } - } else if (etype == RCVHQ_RCV_TYPE_NON_KD) { + if (unlikely(eflags)) + ipath_rcv_hdrerr(dd, eflags, l, etail, rc); + else if (etype == RCVHQ_RCV_TYPE_NON_KD) { int ret = __ipath_verbs_rcv(dd, rc + 1, ebuf, tlen); if (ret == -ENODEV) @@ -981,25 +995,7 @@ reloop: else if (etype == RCVHQ_RCV_TYPE_EXPECTED) ipath_dbg("Bug: Expected TID, opcode %x; ignored\n", be32_to_cpu(hdr->bth[0]) & 0xff); - else if (eflags & (INFINIPATH_RHF_H_TIDERR | - INFINIPATH_RHF_H_IHDRERR)) { - /* - * This is a type 3 packet, only the LRH is in the - * rcvhdrq, the rest of the header is in the eager - * buffer. - */ - u8 opcode; - if (ebuf) { - bthbytes = (u8 *) ebuf; - opcode = *bthbytes; - } - else - opcode = 0; - get_rhf_errstring(eflags, emsg, sizeof emsg); - ipath_dbg("Err %x (%s), opcode %x, egrbuf %x, " - "len %x\n", eflags, emsg, opcode, etail, - tlen); - } else { + else { /* * error packet, type of error unknown. * Probably type 3, but we don't know, so don't -- cgit v0.10.2 From c9f79bdc21da9c8d466b6ba7c8bbd6b8e0110ce2 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 17 Jul 2006 18:19:54 -0700 Subject: IB/ipath: Fix ib_ipath driver to work with SRP I am still working on a proposal to remove the phys_to_virt() calls in the ib_ipath driver. 
In the mean time, this patch allows SRP to work by fixing the R_Key check and conversion from IB address to kernel virtual address. It also returns the correct page size for FMRs. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c index 46773c6..a5ca279 100644 --- a/drivers/infiniband/hw/ipath/ipath_keys.c +++ b/drivers/infiniband/hw/ipath/ipath_keys.c @@ -197,6 +197,21 @@ int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, size_t off; int ret; + /* + * We use RKEY == zero for physical addresses + * (see ipath_get_dma_mr). + */ + if (rkey == 0) { + sge->mr = NULL; + sge->vaddr = phys_to_virt(vaddr); + sge->length = len; + sge->sge_length = len; + ss->sg_list = NULL; + ss->num_sge = 1; + ret = 1; + goto bail; + } + mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))]; if (unlikely(mr == NULL || mr->lkey != rkey)) { ret = 0; diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 56ac336..70bce7a 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -627,6 +627,7 @@ static int ipath_query_device(struct ib_device *ibdev, props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | IB_DEVICE_SYS_IMAGE_GUID; + props->page_size_cap = PAGE_SIZE; props->vendor_id = ipath_layer_get_vendorid(dev->dd); props->vendor_part_id = ipath_layer_get_deviceid(dev->dd); props->hw_ver = ipath_layer_get_pcirev(dev->dd); -- cgit v0.10.2 From 16c59419a09f0140a07a1828d6a45656265e07c7 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 17 Jul 2006 18:21:24 -0700 Subject: IB/ipath: ipath_skip_sge() can break if num_sge > 1 ipath_skip_sge() doesn't exactly duplicate the side effects of ipath_copy_sge() if num_sge > 1 since it doesn't decrement ss->num_sge. This could result in the sg_list being accessed out of bounds. Since ipath_skip_sge() is almost always called with num_sge == 1, the original "optimization" is almost never used. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 70bce7a..d70a9b6 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -191,10 +191,6 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) { struct ipath_sge *sge = &ss->sge; - while (length > sge->sge_length) { - length -= sge->sge_length; - ss->sge = *ss->sg_list++; - } while (length) { u32 len = sge->length; -- cgit v0.10.2 From 2527e681fd4fd4231c2e04f09d7b04d3cab8eefe Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 20 Jul 2006 11:25:50 +0300 Subject: IB/mad: Validate MADs for spec compliance Validate MADs sent by userspace clients for spec compliance with C13-18.1.1 (prevent duplicate requests and responses sent on the same port). Without this, RMPP transactions get aborted because of duplicate packets. This patch is similar to that provided by Jack Morgenstein. Signed-off-by: Sean Hefty Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 5ed4dab..1c3cfbb 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -167,6 +167,15 @@ static int is_vendor_method_in_use( return 0; } +int ib_response_mad(struct ib_mad *mad) +{ + return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) || + (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) || + ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) && + (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP))); +} +EXPORT_SYMBOL(ib_response_mad); + /* * ib_register_mad_agent - Register to send/receive MADs */ @@ -570,13 +579,6 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) } EXPORT_SYMBOL(ib_unregister_mad_agent); -static inline int response_mad(struct ib_mad *mad) -{ - /* Trap represses are responses although response bit is reset */ - return ((mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) || - (mad->mad_hdr.method & IB_MGMT_METHOD_RESP)); -} - static void dequeue_mad(struct ib_mad_list_head *mad_list) { struct ib_mad_queue *mad_queue; @@ -723,7 +725,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, switch (ret) { case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: - if (response_mad(&mad_priv->mad.mad) && + if (ib_response_mad(&mad_priv->mad.mad) && mad_agent_priv->agent.recv_handler) { local->mad_priv = mad_priv; local->recv_mad_agent = mad_agent_priv; @@ -1551,7 +1553,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv, unsigned long flags; spin_lock_irqsave(&port_priv->reg_lock, flags); - if (response_mad(mad)) { + if (ib_response_mad(mad)) { u32 hi_tid; struct ib_mad_agent_private *entry; @@ -1799,7 +1801,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, } /* Complete corresponding request */ - if (response_mad(mad_recv_wc->recv_buf.mad)) { + if (ib_response_mad(mad_recv_wc->recv_buf.mad)) { spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); if (!mad_send_wr) { diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index afe70a5..1273f88 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -112,8 +112,10 @@ struct ib_umad_device { struct ib_umad_file { struct ib_umad_port *port; struct list_head recv_list; + struct list_head send_list; struct list_head port_list; spinlock_t recv_lock; + spinlock_t send_lock; wait_queue_head_t recv_wait; struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS]; int agents_dead; @@ -177,12 +179,21 @@ static int queue_packet(struct ib_umad_file *file, return ret; } +static void dequeue_send(struct ib_umad_file *file, + struct ib_umad_packet *packet) + { + spin_lock_irq(&file->send_lock); + list_del(&packet->list); + spin_unlock_irq(&file->send_lock); + } + static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *send_wc) { struct ib_umad_file *file = agent->context; struct ib_umad_packet *packet = send_wc->send_buf->context[0]; + dequeue_send(file, packet); ib_destroy_ah(packet->msg->ah); ib_free_send_mad(packet->msg); @@ -370,6 +381,51 @@ static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf) return 0; } +static int same_destination(struct ib_user_mad_hdr *hdr1, + struct ib_user_mad_hdr *hdr2) +{ + if (!hdr1->grh_present && !hdr2->grh_present) + return (hdr1->lid == hdr2->lid); + + if (hdr1->grh_present && hdr2->grh_present) + return 
!memcmp(hdr1->gid, hdr2->gid, 16); + + return 0; +} + +static int is_duplicate(struct ib_umad_file *file, + struct ib_umad_packet *packet) +{ + struct ib_umad_packet *sent_packet; + struct ib_mad_hdr *sent_hdr, *hdr; + + hdr = (struct ib_mad_hdr *) packet->mad.data; + list_for_each_entry(sent_packet, &file->send_list, list) { + sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data; + + if ((hdr->tid != sent_hdr->tid) || + (hdr->mgmt_class != sent_hdr->mgmt_class)) + continue; + + /* + * No need to be overly clever here. If two new operations have + * the same TID, reject the second as a duplicate. This is more + * restrictive than required by the spec. + */ + if (!ib_response_mad((struct ib_mad *) hdr)) { + if (!ib_response_mad((struct ib_mad *) sent_hdr)) + return 1; + continue; + } else if (!ib_response_mad((struct ib_mad *) sent_hdr)) + continue; + + if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr)) + return 1; + } + + return 0; +} + static ssize_t ib_umad_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { @@ -379,7 +435,6 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, struct ib_ah_attr ah_attr; struct ib_ah *ah; struct ib_rmpp_mad *rmpp_mad; - u8 method; __be64 *tid; int ret, data_len, hdr_len, copy_offset, rmpp_active; @@ -473,28 +528,36 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, } /* - * If userspace is generating a request that will generate a - * response, we need to make sure the high-order part of the - * transaction ID matches the agent being used to send the - * MAD. + * Set the high-order part of the transaction ID to make MADs from + * different agents unique, and allow routing responses back to the + * original requestor. */ - method = ((struct ib_mad_hdr *) packet->msg->mad)->method; - - if (!(method & IB_MGMT_METHOD_RESP) && - method != IB_MGMT_METHOD_TRAP_REPRESS && - method != IB_MGMT_METHOD_SEND) { + if (!ib_response_mad(packet->msg->mad)) { tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid; *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | (be64_to_cpup(tid) & 0xffffffff)); + rmpp_mad->mad_hdr.tid = *tid; + } + + spin_lock_irq(&file->send_lock); + ret = is_duplicate(file, packet); + if (!ret) + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + if (ret) { + ret = -EINVAL; + goto err_msg; } ret = ib_post_send_mad(packet->msg, NULL); if (ret) - goto err_msg; + goto err_send; up_read(&file->port->mutex); return count; +err_send: + dequeue_send(file, packet); err_msg: ib_free_send_mad(packet->msg); err_ah: @@ -657,7 +720,9 @@ static int ib_umad_open(struct inode *inode, struct file *filp) } spin_lock_init(&file->recv_lock); + spin_lock_init(&file->send_lock); INIT_LIST_HEAD(&file->recv_list); + INIT_LIST_HEAD(&file->send_list); init_waitqueue_head(&file->recv_wait); file->port = port; diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 5ff7755..585d28e 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -75,6 +75,7 @@ #define IB_MGMT_METHOD_TRAP_REPRESS 0x07 #define IB_MGMT_METHOD_RESP 0x80 +#define IB_BM_ATTR_MOD_RESP cpu_to_be32(1) #define IB_MGMT_MAX_METHODS 128 @@ -247,6 +248,12 @@ struct ib_mad_send_buf { }; /** + * ib_response_mad - Returns if the specified MAD has been generated in + * response to a sent request or trap. + */ +int ib_response_mad(struct ib_mad *mad); + +/** * ib_get_rmpp_resptime - Returns the RMPP response time. * @rmpp_hdr: An RMPP header. 
*/ -- cgit v0.10.2 From 624d01f899f6bbd75fd06890f231e1f46555d376 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 24 Jul 2006 10:42:00 +0300 Subject: IB/ipoib: Fix oops with ipoib_debug_mcast set Need to set mcast->ah before debug code dereferences it. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ab40488..b5e6a7b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -264,6 +264,10 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, if (!ah) { ipoib_warn(priv, "ib_address_create failed\n"); } else { + spin_lock_irq(&priv->lock); + mcast->ah = ah; + spin_unlock_irq(&priv->lock); + ipoib_dbg_mcast(priv, "MGID " IPOIB_GID_FMT " AV %p, LID 0x%04x, SL %d\n", IPOIB_GID_ARG(mcast->mcmember.mgid), @@ -271,10 +275,6 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, be16_to_cpu(mcast->mcmember.mlid), mcast->mcmember.sl); } - - spin_lock_irq(&priv->lock); - mcast->ah = ah; - spin_unlock_irq(&priv->lock); } /* actually send any queued packets */ -- cgit v0.10.2 From 8a7f752125a930a83f4d8dfe37fa5a081ab19d31 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 19 Jul 2006 17:44:37 +0300 Subject: IB/ipoib: Fix packet loss after hardware address update The neighbour ha field may get updated without destroying the neighbour. In this case, the ha field gets out of sync with the address handle stored in ipoib_neigh->ah, with the result that the ah field would point to an incorrect path, resulting in all packets being lost. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 3f89f5e1..474aa21 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -212,6 +212,7 @@ struct ipoib_path { struct ipoib_neigh { struct ipoib_ah *ah; + union ib_gid dgid; struct sk_buff_head queue; struct neighbour *neighbour; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 1c6ea1c..cf71d2a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -404,6 +404,8 @@ static void path_rec_completion(int status, list_for_each_entry(neigh, &path->neigh_list, list) { kref_get(&path->ah->ref); neigh->ah = path->ah; + memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, + sizeof(union ib_gid)); while ((skb = __skb_dequeue(&neigh->queue))) __skb_queue_tail(&skqueue, skb); @@ -510,6 +512,8 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) if (path->ah) { kref_get(&path->ah->ref); neigh->ah = path->ah; + memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, + sizeof(union ib_gid)); ipoib_send(dev, skb, path->ah, be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); @@ -633,6 +637,25 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) neigh = *to_ipoib_neigh(skb->dst->neighbour); if (likely(neigh->ah)) { + if (unlikely(memcmp(&neigh->dgid.raw, + skb->dst->neighbour->ha + 4, + sizeof(union ib_gid)))) { + spin_lock(&priv->lock); + /* + * It's safe to call ipoib_put_ah() inside + * priv->lock here, because we know that + * path->ah will always hold one more reference, + * so ipoib_put_ah() will never do more than + * decrement the ref count. 
+ */ + ipoib_put_ah(neigh->ah); + list_del(&neigh->list); + ipoib_neigh_free(neigh); + spin_unlock(&priv->lock); + ipoib_path_lookup(skb, dev); + goto out; + } + ipoib_send(dev, skb, neigh->ah, be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); goto out; -- cgit v0.10.2 From 8fdf679fdb00f588b65abb9c775c178098a05aeb Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 24 Jul 2006 09:36:50 -0700 Subject: IB/mthca: Initialize max_cmds before debug code prints it Read the max_cmds value from the response to the QUERY_FW command before printing out the value, so that the real value goes into the debug output. Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index d0f7731..deabc14 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -778,11 +778,12 @@ int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status) ((dev->fw_ver & 0xffff0000ull) >> 16) | ((dev->fw_ver & 0x0000ffffull) << 16); + MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); + dev->cmd.max_cmds = 1 << lg; + mthca_dbg(dev, "FW version %012llx, max commands %d\n", (unsigned long long) dev->fw_ver, dev->cmd.max_cmds); - MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); - dev->cmd.max_cmds = 1 << lg; MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET); MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET); -- cgit v0.10.2 From 4b79f0af48d529a360d3529def01835dc5d45fe1 Mon Sep 17 00:00:00 2001 From: Ian McDonald Date: Sun, 23 Jul 2006 23:33:28 -0700 Subject: [DCCP]: Fix default sequence window size When using the default sequence window size (100) I got the following in my logs: Jun 22 14:24:09 localhost kernel: [ 1492.114775] DCCP: Step 6 failed for DATA packet, (LSWL(6279674225) <= P.seqno(6279674749) <= S.SWH(6279674324)) and (P.ackno doesn't exist or LAWL(18798206530) <= P.ackno(1125899906842620) <= S.AWH(18798206548), sending SYNC... Jun 22 14:24:09 localhost kernel: [ 1492.115147] DCCP: Step 6 failed for DATA packet, (LSWL(6279674225) <= P.seqno(6279674750) <= S.SWH(6279674324)) and (P.ackno doesn't exist or LAWL(18798206530) <= P.ackno(1125899906842620) <= S.AWH(18798206549), sending SYNC... I went to alter the default sysctl and it didn't take for new sockets. Below patch fixes this. I think the default is too low but it is what the DCCP spec specifies. As a side effect of this my rx speed using iperf goes from about 2.8 Mbits/sec to 3.5. This is still far too slow but it is a step in the right direction. Compile tested only for IPv6 but not particularly complex change. Signed off by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 6048373..b44c4550 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -26,4 +26,6 @@ extern void dccp_feat_clean(struct dccp_minisock *dmsk); extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); extern int dccp_feat_init(struct dccp_minisock *dmsk); +extern int dccp_feat_default_sequence_window; + #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index c3073e7..7f56f7e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -504,8 +504,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; - req->rcv_wnd = 100; /* Fake, option parsing will get the - right value */ + req->rcv_wnd = dccp_feat_default_sequence_window; ireq->opt = NULL; /* diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ff42bc4..9f3d4d7 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -31,6 +31,7 @@ #include "dccp.h" #include "ipv6.h" +#include "feat.h" /* Socket used for sending RSTs and ACKs */ static struct socket *dccp_v6_ctl_socket; @@ -707,8 +708,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr); ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr); - req->rcv_wnd = 100; /* Fake, option parsing will get the - right value */ + req->rcv_wnd = dccp_feat_default_sequence_window; ireq6->pktopts = NULL; if (ipv6_opt_accepted(sk, skb) || diff --git a/net/dccp/options.c b/net/dccp/options.c index c3cda1e..daf72bb 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -29,6 +29,8 @@ int dccp_feat_default_ack_ratio = DCCPF_INITIAL_ACK_RATIO; int dccp_feat_default_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; int dccp_feat_default_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; +EXPORT_SYMBOL_GPL(dccp_feat_default_sequence_window); + void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = dccp_feat_default_sequence_window; -- cgit v0.10.2 From 2266d8886f64c66e0a4e61e3e1c19dbc27ed00d4 Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Sun, 23 Jul 2006 23:37:24 -0700 Subject: [PKT_SCHED]: Fix regression in PSCHED_TADD{,2}. In PSCHED_TADD and PSCHED_TADD2, if delta is less than tv.tv_usec (so, less than USEC_PER_SEC too) then tv_res will be smaller than tv. The affectation "(tv_res).tv_usec = __delta;" is wrong. The fix is to revert to the original code before 4ee303dfeac6451b402e3d8512723d3a0f861857 and change the 'if' in 'while'. [Shuya MAEDA: "while (__delta >= USEC_PER_SEC){ ... }" instead of "while (__delta > USEC_PER_SEC){ ... }"] Signed-off-by: Guillaume Chazarain Signed-off-by: David S. 
Miller diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 1925c65..f6afee7 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -169,23 +169,17 @@ psched_tod_diff(int delta_sec, int bound) #define PSCHED_TADD2(tv, delta, tv_res) \ ({ \ - int __delta = (delta); \ - (tv_res) = (tv); \ - while(__delta >= USEC_PER_SEC){ \ - (tv_res).tv_sec++; \ - __delta -= USEC_PER_SEC; \ - } \ + int __delta = (tv).tv_usec + (delta); \ + (tv_res).tv_sec = (tv).tv_sec; \ + while (__delta >= USEC_PER_SEC) { (tv_res).tv_sec++; __delta -= USEC_PER_SEC; } \ (tv_res).tv_usec = __delta; \ }) #define PSCHED_TADD(tv, delta) \ ({ \ - int __delta = (delta); \ - while(__delta >= USEC_PER_SEC){ \ - (tv).tv_sec++; \ - __delta -= USEC_PER_SEC; \ - } \ - (tv).tv_usec = __delta; \ + (tv).tv_usec += (delta); \ + while ((tv).tv_usec >= USEC_PER_SEC) { (tv).tv_sec++; \ + (tv).tv_usec -= USEC_PER_SEC; } \ }) /* Set/check that time is in the "past perfect"; -- cgit v0.10.2 From 98bcd08b5bfe78c1c9bda5768aa081e0fe4fcc4f Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 14 Jul 2006 11:42:12 +0200 Subject: [Bluetooth] Correct RFCOMM channel MTU for broken implementations Some Bluetooth RFCOMM implementations try to negotiate a bigger channel MTU than we can support for a particular session. The maximum MTU for a RFCOMM session is limited through the L2CAP layer. So if the other side proposes a channel MTU that is bigger than the underlying L2CAP MTU, we should reduce it to the L2CAP MTU of the session minus five bytes for the RFCOMM headers. Signed-off-by: Marcel Holtmann diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 77eab8f..332dd8f 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -55,6 +55,7 @@ #define VERSION "1.8" static int disable_cfc = 0; +static int channel_mtu = -1; static unsigned int l2cap_mtu = RFCOMM_MAX_L2CAP_MTU; static struct task_struct *rfcomm_thread; @@ -812,7 +813,10 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d pn->credits = 0; } - pn->mtu = htobs(d->mtu); + if (cr && channel_mtu >= 0) + pn->mtu = htobs(channel_mtu); + else + pn->mtu = htobs(d->mtu); *ptr = __fcs(buf); ptr++; @@ -1243,7 +1247,10 @@ static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn) d->priority = pn->priority; - d->mtu = s->mtu = btohs(pn->mtu); + d->mtu = btohs(pn->mtu); + + if (cr && d->mtu > s->mtu) + d->mtu = s->mtu; return 0; } @@ -1770,6 +1777,11 @@ static inline void rfcomm_accept_connection(struct rfcomm_session *s) s = rfcomm_session_add(nsock, BT_OPEN); if (s) { rfcomm_session_hold(s); + + /* We should adjust MTU on incoming sessions. + * L2CAP MTU minus UIH header and FCS. 
*/ + s->mtu = min(l2cap_pi(nsock->sk)->omtu, l2cap_pi(nsock->sk)->imtu) - 5; + rfcomm_schedule(RFCOMM_SCHED_RX); } else sock_release(nsock); @@ -2087,6 +2099,9 @@ module_exit(rfcomm_exit); module_param(disable_cfc, bool, 0644); MODULE_PARM_DESC(disable_cfc, "Disable credit based flow control"); +module_param(channel_mtu, int, 0644); +MODULE_PARM_DESC(channel_mtu, "Default MTU for the RFCOMM channel"); + module_param(l2cap_mtu, uint, 0644); MODULE_PARM_DESC(l2cap_mtu, "Default MTU for the L2CAP connection"); -- cgit v0.10.2 From 520ca78acc652c89c92e8bf29536319afa9d88bb Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Fri, 14 Jul 2006 16:01:52 +0200 Subject: [Bluetooth] Correct SCO buffer size for another Broadcom chip The SCO buffer size values on IBM/Lenovo ThinkPad laptops with a Bluetooth chip from Broadcom are wrong. The USB Bluetooth driver has to set a quirk to correct the SCO buffer size values. Signed-off-by: Marcel Holtmann diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index 6a0c223..f4784f4 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -67,6 +67,8 @@ static int ignore = 0; static int ignore_dga = 0; static int ignore_csr = 0; static int ignore_sniffer = 0; +static int disable_scofix = 0; +static int force_scofix = 0; static int reset = 0; #ifdef CONFIG_BT_HCIUSB_SCO @@ -110,6 +112,9 @@ static struct usb_device_id blacklist_ids[] = { { USB_DEVICE(0x0a5c, 0x200a), .driver_info = HCI_RESET | HCI_BROKEN_ISOC }, { USB_DEVICE(0x0a5c, 0x2009), .driver_info = HCI_BCM92035 }, + /* IBM/Lenovo ThinkPad with Broadcom chip */ + { USB_DEVICE(0x0a5c, 0x201e), .driver_info = HCI_WRONG_SCO_MTU }, + /* Microsoft Wireless Transceiver for Bluetooth 2.0 */ { USB_DEVICE(0x045e, 0x009c), .driver_info = HCI_RESET }, @@ -990,8 +995,10 @@ static int hci_usb_probe(struct usb_interface *intf, const struct usb_device_id if (reset || id->driver_info & HCI_RESET) set_bit(HCI_QUIRK_RESET_ON_INIT, &hdev->quirks); - if (id->driver_info & HCI_WRONG_SCO_MTU) - set_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks); + if (force_scofix || id->driver_info & HCI_WRONG_SCO_MTU) { + if (!disable_scofix) + set_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks); + } if (id->driver_info & HCI_SNIFFER) { if (le16_to_cpu(udev->descriptor.bcdDevice) > 0x997) @@ -1161,6 +1168,12 @@ MODULE_PARM_DESC(ignore_csr, "Ignore devices with id 0a12:0001"); module_param(ignore_sniffer, bool, 0644); MODULE_PARM_DESC(ignore_sniffer, "Ignore devices with id 0a12:0002"); +module_param(disable_scofix, bool, 0644); +MODULE_PARM_DESC(disable_scofix, "Disable fixup of wrong SCO buffer size"); + +module_param(force_scofix, bool, 0644); +MODULE_PARM_DESC(force_scofix, "Force fixup of wrong SCO buffers size"); + module_param(reset, bool, 0644); MODULE_PARM_DESC(reset, "Send HCI reset command on initialization"); -- cgit v0.10.2 From ea9727f6e55dabc7a58cf56c87e65665e239e171 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 18 Jul 2006 17:47:40 +0200 Subject: [Bluetooth] Correct SCO buffer size for Belkin devices The Belkin F8T012 and F8T013 devices are both based on a Bluetooth chip from Broadcom and their SCO buffer size values are wrong. The Bluetooth core should correct these values. 
Signed-off-by: Marcel Holtmann diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index f4784f4..d73eb10 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -127,8 +127,9 @@ static struct usb_device_id blacklist_ids[] = { /* RTX Telecom based adapter with buggy SCO support */ { USB_DEVICE(0x0400, 0x0807), .driver_info = HCI_BROKEN_ISOC }, - /* Belkin F8T012 */ + /* Belkin F8T012 and F8T013 devices */ { USB_DEVICE(0x050d, 0x0012), .driver_info = HCI_WRONG_SCO_MTU }, + { USB_DEVICE(0x050d, 0x0013), .driver_info = HCI_WRONG_SCO_MTU }, /* Digianswer devices */ { USB_DEVICE(0x08fd, 0x0001), .driver_info = HCI_DIGIANSWER }, -- cgit v0.10.2 From 8e4f7230a3bd015862f3af58dc563dbc1cdebfe2 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 18 Jul 2006 18:04:59 +0200 Subject: [Bluetooth] Add quirk for another broken RTX Telecom based dongle This patch disables the ISOC transfers for another broken RTX Telecom based USB dongle. Starting the USB ISOC transfers only ends in a burst of error messages for invalid SCO packets on connection handle 0. Signed-off-by: Marcel Holtmann diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index d73eb10..963c0145 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -124,8 +124,9 @@ static struct usb_device_id blacklist_ids[] = { /* ISSC Bluetooth Adapter v3.1 */ { USB_DEVICE(0x1131, 0x1001), .driver_info = HCI_RESET }, - /* RTX Telecom based adapter with buggy SCO support */ + /* RTX Telecom based adapters with buggy SCO support */ { USB_DEVICE(0x0400, 0x0807), .driver_info = HCI_BROKEN_ISOC }, + { USB_DEVICE(0x0400, 0x080a), .driver_info = HCI_BROKEN_ISOC }, /* Belkin F8T012 and F8T013 devices */ { USB_DEVICE(0x050d, 0x0012), .driver_info = HCI_WRONG_SCO_MTU }, -- cgit v0.10.2 From e9e9290f5c85887baf1123a36ec9fdf56a10cf4b Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 18 Jul 2006 18:32:33 +0200 Subject: [Bluetooth] Enable SCO support for Broadcom HID proxy dongle The Broadcom dongles with HID proxy support actually support SCO over HCI if the SCO buffer size values are corrected. So instead of disabling the SCO support, mark this dongle with the quirk for the Bluetooth core to correct the wrong buffer size values. Signed-off-by: Marcel Holtmann diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index 963c0145..e2d4bea 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -109,7 +109,7 @@ static struct usb_device_id blacklist_ids[] = { { USB_DEVICE(0x0a5c, 0x2033), .driver_info = HCI_IGNORE }, /* Broadcom BCM2035 */ - { USB_DEVICE(0x0a5c, 0x200a), .driver_info = HCI_RESET | HCI_BROKEN_ISOC }, + { USB_DEVICE(0x0a5c, 0x200a), .driver_info = HCI_RESET | HCI_WRONG_SCO_MTU }, { USB_DEVICE(0x0a5c, 0x2009), .driver_info = HCI_BCM92035 }, /* IBM/Lenovo ThinkPad with Broadcom chip */ -- cgit v0.10.2 From a922ba5510530ae8e3c60edc85c56f72347a3c86 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 24 Jul 2006 13:49:06 -0700 Subject: [IPV6] xfrm6_tunnel: Delete debugging code. It doesn't compile, and it's dubious in several regards: 1) is enabled by non-Kconfig controlled CONFIG_* value (noted by Randy Dunlap) 2) XFRM6_TUNNEL_SPI_MAGIC is defined after it's first use 3) the debugging messages print object pointer addresses which have no meaning without context So let's just get rid of it. Signed-off-by: David S. 
Miller diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 6b44fe8..c8f9369 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -31,27 +31,6 @@ #include #include -#ifdef CONFIG_IPV6_XFRM6_TUNNEL_DEBUG -# define X6TDEBUG 3 -#else -# define X6TDEBUG 1 -#endif - -#define X6TPRINTK(fmt, args...) printk(fmt, ## args) -#define X6TNOPRINTK(fmt, args...) do { ; } while(0) - -#if X6TDEBUG >= 1 -# define X6TPRINTK1 X6TPRINTK -#else -# define X6TPRINTK1 X6TNOPRINTK -#endif - -#if X6TDEBUG >= 3 -# define X6TPRINTK3 X6TPRINTK -#else -# define X6TPRINTK3 X6TNOPRINTK -#endif - /* * xfrm_tunnel_spi things are for allocating unique id ("spi") * per xfrm_address_t. @@ -62,15 +41,8 @@ struct xfrm6_tunnel_spi { xfrm_address_t addr; u32 spi; atomic_t refcnt; -#ifdef XFRM6_TUNNEL_SPI_MAGIC - u32 magic; -#endif }; -#ifdef CONFIG_IPV6_XFRM6_TUNNEL_DEBUG -# define XFRM6_TUNNEL_SPI_MAGIC 0xdeadbeef -#endif - static DEFINE_RWLOCK(xfrm6_tunnel_spi_lock); static u32 xfrm6_tunnel_spi; @@ -86,43 +58,15 @@ static kmem_cache_t *xfrm6_tunnel_spi_kmem __read_mostly; static struct hlist_head xfrm6_tunnel_spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; static struct hlist_head xfrm6_tunnel_spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; -#ifdef XFRM6_TUNNEL_SPI_MAGIC -static int x6spi_check_magic(const struct xfrm6_tunnel_spi *x6spi, - const char *name) -{ - if (unlikely(x6spi->magic != XFRM6_TUNNEL_SPI_MAGIC)) { - X6TPRINTK3(KERN_DEBUG "%s(): x6spi object " - "at %p has corrupted magic %08x " - "(should be %08x)\n", - name, x6spi, x6spi->magic, XFRM6_TUNNEL_SPI_MAGIC); - return -1; - } - return 0; -} -#else -static int inline x6spi_check_magic(const struct xfrm6_tunnel_spi *x6spi, - const char *name) -{ - return 0; -} -#endif - -#define X6SPI_CHECK_MAGIC(x6spi) x6spi_check_magic((x6spi), __FUNCTION__) - - static unsigned inline xfrm6_tunnel_spi_hash_byaddr(xfrm_address_t *addr) { unsigned h; - X6TPRINTK3(KERN_DEBUG "%s(addr=%p)\n", __FUNCTION__, addr); - h = addr->a6[0] ^ addr->a6[1] ^ addr->a6[2] ^ addr->a6[3]; h ^= h >> 16; h ^= h >> 8; h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; - X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, h); - return h; } @@ -136,19 +80,13 @@ static int xfrm6_tunnel_spi_init(void) { int i; - X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); - xfrm6_tunnel_spi = 0; xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", sizeof(struct xfrm6_tunnel_spi), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!xfrm6_tunnel_spi_kmem) { - X6TPRINTK1(KERN_ERR - "%s(): failed to allocate xfrm6_tunnel_spi_kmem\n", - __FUNCTION__); + if (!xfrm6_tunnel_spi_kmem) return -ENOMEM; - } for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byaddr[i]); @@ -161,22 +99,16 @@ static void xfrm6_tunnel_spi_fini(void) { int i; - X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); - for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) { if (!hlist_empty(&xfrm6_tunnel_spi_byaddr[i])) - goto err; + return; } for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) { if (!hlist_empty(&xfrm6_tunnel_spi_byspi[i])) - goto err; + return; } kmem_cache_destroy(xfrm6_tunnel_spi_kmem); xfrm6_tunnel_spi_kmem = NULL; - return; -err: - X6TPRINTK1(KERN_ERR "%s(): table is not empty\n", __FUNCTION__); - return; } static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) @@ -184,19 +116,13 @@ static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) struct xfrm6_tunnel_spi *x6spi; struct hlist_node *pos; - X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, 
saddr); - hlist_for_each_entry(x6spi, pos, &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { - if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { - X6SPI_CHECK_MAGIC(x6spi); - X6TPRINTK3(KERN_DEBUG "%s() = %p(%u)\n", __FUNCTION__, x6spi, x6spi->spi); + if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) return x6spi; - } } - X6TPRINTK3(KERN_DEBUG "%s() = NULL(0)\n", __FUNCTION__); return NULL; } @@ -205,8 +131,6 @@ u32 xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) struct xfrm6_tunnel_spi *x6spi; u32 spi; - X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); - read_lock_bh(&xfrm6_tunnel_spi_lock); x6spi = __xfrm6_tunnel_spi_lookup(saddr); spi = x6spi ? x6spi->spi : 0; @@ -223,8 +147,6 @@ static u32 __xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) struct hlist_node *pos; unsigned index; - X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); - if (xfrm6_tunnel_spi < XFRM6_TUNNEL_SPI_MIN || xfrm6_tunnel_spi >= XFRM6_TUNNEL_SPI_MAX) xfrm6_tunnel_spi = XFRM6_TUNNEL_SPI_MIN; @@ -258,18 +180,10 @@ try_next_2:; spi = 0; goto out; alloc_spi: - X6TPRINTK3(KERN_DEBUG "%s(): allocate new spi for " NIP6_FMT "\n", - __FUNCTION__, - NIP6(*(struct in6_addr *)saddr)); x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, SLAB_ATOMIC); - if (!x6spi) { - X6TPRINTK1(KERN_ERR "%s(): kmem_cache_alloc() failed\n", - __FUNCTION__); + if (!x6spi) goto out; - } -#ifdef XFRM6_TUNNEL_SPI_MAGIC - x6spi->magic = XFRM6_TUNNEL_SPI_MAGIC; -#endif + memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); x6spi->spi = spi; atomic_set(&x6spi->refcnt, 1); @@ -278,9 +192,7 @@ alloc_spi: index = xfrm6_tunnel_spi_hash_byaddr(saddr); hlist_add_head(&x6spi->list_byaddr, &xfrm6_tunnel_spi_byaddr[index]); - X6SPI_CHECK_MAGIC(x6spi); out: - X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, spi); return spi; } @@ -289,8 +201,6 @@ u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) struct xfrm6_tunnel_spi *x6spi; u32 spi; - X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); - write_lock_bh(&xfrm6_tunnel_spi_lock); x6spi = __xfrm6_tunnel_spi_lookup(saddr); if (x6spi) { @@ -300,8 +210,6 @@ u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) spi = __xfrm6_tunnel_alloc_spi(saddr); write_unlock_bh(&xfrm6_tunnel_spi_lock); - X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, spi); - return spi; } @@ -312,8 +220,6 @@ void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) struct xfrm6_tunnel_spi *x6spi; struct hlist_node *pos, *n; - X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); - write_lock_bh(&xfrm6_tunnel_spi_lock); hlist_for_each_entry_safe(x6spi, pos, n, @@ -321,12 +227,6 @@ void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) list_byaddr) { if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { - X6TPRINTK3(KERN_DEBUG "%s(): x6spi object for " NIP6_FMT - " found at %p\n", - __FUNCTION__, - NIP6(*(struct in6_addr *)saddr), - x6spi); - X6SPI_CHECK_MAGIC(x6spi); if (atomic_dec_and_test(&x6spi->refcnt)) { hlist_del(&x6spi->list_byaddr); hlist_del(&x6spi->list_byspi); @@ -377,20 +277,14 @@ static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case ICMPV6_ADDR_UNREACH: case ICMPV6_PORT_UNREACH: default: - X6TPRINTK3(KERN_DEBUG - "xfrm6_tunnel: Destination Unreach.\n"); break; } break; case ICMPV6_PKT_TOOBIG: - X6TPRINTK3(KERN_DEBUG - "xfrm6_tunnel: Packet Too Big.\n"); break; case ICMPV6_TIME_EXCEED: switch (code) { case ICMPV6_EXC_HOPLIMIT: - X6TPRINTK3(KERN_DEBUG - "xfrm6_tunnel: Too small Hoplimit.\n"); break; case ICMPV6_EXC_FRAGTIME: default: @@ 
-447,22 +341,14 @@ static struct xfrm6_tunnel xfrm6_tunnel_handler = { static int __init xfrm6_tunnel_init(void) { - X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); - - if (xfrm_register_type(&xfrm6_tunnel_type, AF_INET6) < 0) { - X6TPRINTK1(KERN_ERR - "xfrm6_tunnel init: can't add xfrm type\n"); + if (xfrm_register_type(&xfrm6_tunnel_type, AF_INET6) < 0) return -EAGAIN; - } + if (xfrm6_tunnel_register(&xfrm6_tunnel_handler)) { - X6TPRINTK1(KERN_ERR - "xfrm6_tunnel init(): can't add handler\n"); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); return -EAGAIN; } if (xfrm6_tunnel_spi_init() < 0) { - X6TPRINTK1(KERN_ERR - "xfrm6_tunnel init: failed to initialize spi\n"); xfrm6_tunnel_deregister(&xfrm6_tunnel_handler); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); return -EAGAIN; @@ -472,15 +358,9 @@ static int __init xfrm6_tunnel_init(void) static void __exit xfrm6_tunnel_fini(void) { - X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); - xfrm6_tunnel_spi_fini(); - if (xfrm6_tunnel_deregister(&xfrm6_tunnel_handler)) - X6TPRINTK1(KERN_ERR - "xfrm6_tunnel close: can't remove handler\n"); - if (xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6) < 0) - X6TPRINTK1(KERN_ERR - "xfrm6_tunnel close: can't remove xfrm type\n"); + xfrm6_tunnel_deregister(&xfrm6_tunnel_handler); + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); } module_init(xfrm6_tunnel_init); -- cgit v0.10.2 From 6c753c3d3be0f8d1b570ec5720ad1bb4caf8232b Mon Sep 17 00:00:00 2001 From: Stefan Rompf Date: Mon, 24 Jul 2006 13:52:13 -0700 Subject: [VLAN]: Fix link state propagation When the queue of the underlying device is stopped at initialization time or the device is marked "not present", the state will be propagated to the vlan device and never change. Based on an analysis by Patrick McHardy. Signed-off-by: Stefan Rompf ACKed-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 0ade0c6..18fcb9f 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -67,10 +67,6 @@ static struct packet_type vlan_packet_type = { .func = vlan_skb_recv, /* VLAN receive method */ }; -/* Bits of netdev state that are propagated from real device to virtual */ -#define VLAN_LINK_STATE_MASK \ - ((1<<__LINK_STATE_PRESENT)|(1<<__LINK_STATE_NOCARRIER)|(1<<__LINK_STATE_DORMANT)) - /* End of global variables definitions. */ /* @@ -479,7 +475,9 @@ static struct net_device *register_vlan_device(const char *eth_IF_name, new_dev->flags = real_dev->flags; new_dev->flags &= ~IFF_UP; - new_dev->state = real_dev->state & ~(1<<__LINK_STATE_START); + new_dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | + (1<<__LINK_STATE_DORMANT))) | + (1<<__LINK_STATE_PRESENT); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. -- cgit v0.10.2 From 37182d1bd3264cf9c0dce3408bee48af0755de7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2006 15:30:28 -0700 Subject: [NET]: Remove CONFIG_HAVE_ARCH_DEV_ALLOC_SKB skbuff.h has an #ifndef CONFIG_HAVE_ARCH_DEV_ALLOC_SKB to allow architectures to reimplement __dev_alloc_skb. It's not set on any architecture and now that we have an architecture-overrideable NET_SKB_PAD there is not point at all to have one either. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0bf31b8..f987579 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1066,7 +1066,6 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } -#ifndef CONFIG_HAVE_ARCH_DEV_ALLOC_SKB /** * __dev_alloc_skb - allocate an skbuff for sending * @length: length to allocate @@ -1087,9 +1086,6 @@ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, skb_reserve(skb, NET_SKB_PAD); return skb; } -#else -extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask); -#endif /** * dev_alloc_skb - allocate an skbuff for sending -- cgit v0.10.2 From b4e54de8d34afe7fcf08bfe91070d9dfeae6ed27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2006 15:31:14 -0700 Subject: [NET]: Correct dev_alloc_skb kerneldoc dev_alloc_skb is designated for RX descriptors, not TX. (Some drivers use it for the latter anyway, but that's a different story) Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f987579..4307e76 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1067,7 +1067,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) } /** - * __dev_alloc_skb - allocate an skbuff for sending + * __dev_alloc_skb - allocate an skbuff for receiving * @length: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb * @@ -1088,7 +1088,7 @@ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, } /** - * dev_alloc_skb - allocate an skbuff for sending + * dev_alloc_skb - allocate an skbuff for receiving * @length: length to allocate * * Allocate a new &sk_buff and assign it a usage count of one. The -- cgit v0.10.2 From eb398d1044e0c1c19c2f5041acdb29ddb5bbc9f8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 22 Jul 2006 01:12:09 -0700 Subject: [SPARC64]: Explicitly print return PC when the kernel fault PC is bogus. That way we'll have at least some debugging info even if the stack dump explodes. Signed-off-by: David S. Miller diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index 1605967..55ae802 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -132,6 +133,8 @@ static void bad_kernel_pc(struct pt_regs *regs, unsigned long vaddr) printk(KERN_CRIT "OOPS: Bogus kernel PC [%016lx] in fault handler\n", regs->tpc); + printk(KERN_CRIT "OOPS: RPC [%016lx]\n", regs->u_regs[15]); + print_symbol("RPC: <%s>\n", regs->u_regs[15]); printk(KERN_CRIT "OOPS: Fault was to vaddr[%lx]\n", vaddr); __asm__("mov %%sp, %0" : "=r" (ksp)); show_stack(current, ksp); -- cgit v0.10.2 From 29ed46015dd61f99d203ec7ab307ccf92d2d0cf2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 22 Jul 2006 02:05:07 -0700 Subject: [SPARC]: Fix SA_STATIC_ALLOC value. It alises IRQF_SHARED which causes all kinds of problems. Signed-off-by: David S. Miller diff --git a/include/asm-sparc/signal.h b/include/asm-sparc/signal.h index 0ae5084..d03a21c 100644 --- a/include/asm-sparc/signal.h +++ b/include/asm-sparc/signal.h @@ -168,7 +168,7 @@ struct sigstack { * statically allocated data.. which is NOT GOOD. * */ -#define SA_STATIC_ALLOC 0x80 +#define SA_STATIC_ALLOC 0x8000 #endif #include -- cgit v0.10.2 From 6bc063d414a815937fc81449fa9ffe8d3a4cdf22 Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Mon, 24 Jul 2006 22:47:14 -0700 Subject: [SCSI] esp: Fix build. The data_cmd[] member got deleted, so do not use it any more. Scsi commands do not have their ->cmd[] overwritten temporary to probe for status after an error before retrying. Signed-off-by: David S. Miller diff --git a/drivers/scsi/esp.c b/drivers/scsi/esp.c index eaf64c7..98bd227 100644 --- a/drivers/scsi/esp.c +++ b/drivers/scsi/esp.c @@ -2754,18 +2754,15 @@ static int esp_do_data_finale(struct esp *esp) */ static int esp_should_clear_sync(struct scsi_cmnd *sp) { - u8 cmd1 = sp->cmnd[0]; - u8 cmd2 = sp->data_cmnd[0]; + u8 cmd = sp->cmnd[0]; /* These cases are for spinning up a disk and * waiting for that spinup to complete. */ - if (cmd1 == START_STOP || - cmd2 == START_STOP) + if (cmd == START_STOP) return 0; - if (cmd1 == TEST_UNIT_READY || - cmd2 == TEST_UNIT_READY) + if (cmd == TEST_UNIT_READY) return 0; /* One more special case for SCSI tape drives, @@ -2773,8 +2770,7 @@ static int esp_should_clear_sync(struct scsi_cmnd *sp) * completion of a rewind or tape load operation. */ if (sp->device->type == TYPE_TAPE) { - if (cmd1 == MODE_SENSE || - cmd2 == MODE_SENSE) + if (cmd == MODE_SENSE) return 0; } -- cgit v0.10.2 From 083edca05ab1fa6efac1ba414018f7f45a4a83ff Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 24 Jul 2006 22:52:10 -0700 Subject: [NETFILTER]: H.323 helper: fix possible NULL-ptr dereference An RCF message containing a timeout results in a NULL-ptr dereference if no RRQ has been seen before. Noticed by the "SATURN tool", reported by Thomas Dillig and Isil Dillig . Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index af35235..9a39e29 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c @@ -1200,7 +1200,7 @@ static struct ip_conntrack_expect *find_expect(struct ip_conntrack *ct, tuple.dst.protonum = IPPROTO_TCP; exp = __ip_conntrack_expect_find(&tuple); - if (exp->master == ct) + if (exp && exp->master == ct) return exp; return NULL; } -- cgit v0.10.2 From 3bc38712e3a6e0596ccb6f8299043a826f983701 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 24 Jul 2006 22:52:47 -0700 Subject: [NETFILTER]: nf_queue: handle NF_STOP and unknown verdicts in nf_reinject In case of an unknown verdict or NF_STOP the packet leaks. Unknown verdicts can happen when userspace is buggy. Reinject the packet in case of NF_STOP, drop on unknown verdicts. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index bb6fcee..662a869 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -219,21 +219,20 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info, switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: + case NF_STOP: info->okfn(skb); + case NF_STOLEN: break; - case NF_QUEUE: if (!nf_queue(&skb, elem, info->pf, info->hook, info->indev, info->outdev, info->okfn, verdict >> NF_VERDICT_BITS)) goto next_hook; break; + default: + kfree_skb(skb); } rcu_read_unlock(); - - if (verdict == NF_DROP) - kfree_skb(skb); - kfree(info); return; } -- cgit v0.10.2 From 72b558235950538da8bf5a8de746a194831c6fe6 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 24 Jul 2006 22:53:12 -0700 Subject: [NETFILTER]: conntrack: fix SYSCTL=n compile Signed-off-by: Adrian Bunk Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 7bd3c22..7a9fa04 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -534,6 +534,8 @@ static struct nf_hook_ops ip_conntrack_ops[] = { /* Sysctl support */ +int ip_conntrack_checksum = 1; + #ifdef CONFIG_SYSCTL /* From ip_conntrack_core.c */ @@ -568,8 +570,6 @@ extern unsigned int ip_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; -int ip_conntrack_checksum = 1; - static struct ctl_table_header *ip_ct_sysctl_header; static ctl_table ip_ct_sysctl_table[] = { diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5fcab2e..4ef8366 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -428,6 +428,8 @@ static struct file_operations ct_cpu_seq_fops = { /* Sysctl support */ +int nf_conntrack_checksum = 1; + #ifdef CONFIG_SYSCTL /* From nf_conntrack_core.c */ @@ -459,8 +461,6 @@ extern unsigned int nf_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; -int nf_conntrack_checksum = 1; - static struct ctl_table_header *nf_ct_sysctl_header; static ctl_table nf_ct_sysctl_table[] = { -- cgit v0.10.2 From 8cf8fb5687bb37737ea419a0b2143aab49295779 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 24 Jul 2006 22:53:35 -0700 Subject: [NETFILTER]: SNMP NAT: fix byteorder confusion Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 0b1b416..18b7fbd 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1255,9 +1255,9 @@ static int help(struct sk_buff **pskb, struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); /* SNMP replies and originating SNMP traps get mangled */ - if (udph->source == ntohs(SNMP_PORT) && dir != IP_CT_DIR_REPLY) + if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) return NF_ACCEPT; - if (udph->dest == ntohs(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) + if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) return NF_ACCEPT; /* No NAT? */ -- cgit v0.10.2 From 28658c8967da9083be83af0a37be3b190bae79da Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Mon, 24 Jul 2006 22:54:14 -0700 Subject: [NETFILTER]: xt_pkttype: fix mismatches on locally generated packets Locally generated broadcast and multicast packets have pkttype set to PACKET_LOOPBACK instead of PACKET_BROADCAST or PACKET_MULTICAST. This causes the pkttype match to fail to match packets of either type. The below patch remedies this by using the daddr as a hint as to broadcast|multicast. While not pretty, this seems like the only way to solve the problem short of just noting this as a limitation of the match. This resolves netfilter bugzilla #484 Signed-off-by: Phil Oester Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index 3ac703b..d2f5320 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include @@ -28,9 +30,17 @@ static int match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { + u_int8_t type; const struct xt_pkttype_info *info = matchinfo; - return (skb->pkt_type == info->pkttype) ^ info->invert; + if (skb->pkt_type == PACKET_LOOPBACK) + type = (MULTICAST(skb->nh.iph->daddr) + ? PACKET_MULTICAST + : PACKET_BROADCAST); + else + type = skb->pkt_type; + + return (type == info->pkttype) ^ info->invert; } static struct xt_match pkttype_match = { -- cgit v0.10.2 From 10ea6ac895418bd0d23900e3330daa6ba0836d26 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 24 Jul 2006 22:54:55 -0700 Subject: [NETFILTER]: bridge netfilter: add deferred output hooks to feature-removal-schedule Add bridge netfilter deferred output hooks to feature-removal-schedule and disable them by default. Until their removal they will be activated by the physdev match when needed. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 9d3a077..87851ef 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -258,3 +258,19 @@ Why: These drivers never compiled since they were added to the kernel Who: Jean Delvare --------------------------- + +What: Bridge netfilter deferred IPv4/IPv6 output hook calling +When: January 2007 +Why: The deferred output hooks are a layering violation causing unusual + and broken behaviour on bridge devices. Examples of things they + break include QoS classifation using the MARK or CLASSIFY targets, + the IPsec policy match and connection tracking with VLANs on a + bridge. Their only use is to enable bridge output port filtering + within iptables with the physdev match, which can also be done by + combining iptables and ebtables using netfilter marks. Until it + will get removed the hook deferral is disabled by default and is + only enabled when needed. 
+ +Who: Patrick McHardy + +--------------------------- diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 8776402..31f02ba03 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -79,6 +79,8 @@ struct bridge_skb_cb { __u32 ipv4; } daddr; }; + +extern int brnf_deferred_hooks; #endif /* CONFIG_BRIDGE_NETFILTER */ #endif /* __KERNEL__ */ diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index cbc8a38..05b3de8 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -61,6 +61,9 @@ static int brnf_filter_vlan_tagged = 1; #define brnf_filter_vlan_tagged 1 #endif +int brnf_deferred_hooks; +EXPORT_SYMBOL_GPL(brnf_deferred_hooks); + static __be16 inline vlan_proto(const struct sk_buff *skb) { return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; @@ -890,6 +893,8 @@ static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb, return NF_ACCEPT; else if (ip->version == 6 && !brnf_call_ip6tables) return NF_ACCEPT; + else if (!brnf_deferred_hooks) + return NF_ACCEPT; #endif if (hook == NF_IP_POST_ROUTING) return NF_ACCEPT; diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 5fe4c9d..a9f4f6f 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -113,6 +113,21 @@ checkentry(const char *tablename, if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) return 0; + if (brnf_deferred_hooks == 0 && + info->bitmask & XT_PHYSDEV_OP_OUT && + (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || + info->invert & XT_PHYSDEV_OP_BRIDGED) && + hook_mask & ((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) | + (1 << NF_IP_POST_ROUTING))) { + printk(KERN_WARNING "physdev match: using --physdev-out in the " + "OUTPUT, FORWARD and POSTROUTING chains for non-bridged " + "traffic is deprecated and breaks other things, it will " + "be removed in January 2007. See Documentation/" + "feature-removal-schedule.txt for details. This doesn't " + "affect you in case you're using it for purely bridged " + "traffic.\n"); + brnf_deferred_hooks = 1; + } return 1; } -- cgit v0.10.2 From d5af981e93aff0de5ad2a1a9935a3f6aa5cd3e3c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 24 Jul 2006 22:55:29 -0700 Subject: [NETFILTER]: Demote xt_sctp to EXPERIMENTAL After the recent problems with all the SCTP stuff it seems reasonable to mark this as experimental. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 42a178a..a9894ddf 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -386,8 +386,8 @@ config NETFILTER_XT_MATCH_REALM . If unsure, say `N'. config NETFILTER_XT_MATCH_SCTP - tristate '"sctp" protocol match support' - depends on NETFILTER_XTABLES + tristate '"sctp" protocol match support (EXPERIMENTAL)' + depends on NETFILTER_XTABLES && EXPERIMENTAL help With this option enabled, you will be able to use the `sctp' match in order to match on SCTP source/destination ports -- cgit v0.10.2 From 6b7fdc3ae18a0598a999156b62d55ea55220e00f Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Mon, 24 Jul 2006 23:44:44 -0700 Subject: [IPV6]: Clean skb cb on IPv6 input. Clear the accumulated junk in IP6CB when starting to handle an IPV6 packet. Signed-off-by: Guillaume Chazarain Signed-off-by: David S. 
Miller diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index df8f051..25c2a9e 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -71,6 +71,8 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto out; } + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + /* * Store incoming device index. When the packet will * be queued, we cannot refer to skb->dev anymore. -- cgit v0.10.2 From d569f1d72f068992d07ab17f7ff9aea7f0d97cdb Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Mon, 24 Jul 2006 23:45:16 -0700 Subject: [IPV4]: Clear the whole IPCB, this clears also IPCB(skb)->flags. Signed-off-by: Guillaume Chazarain Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 184c78c..212734c 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -429,7 +429,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, } /* Remove any debris in the socket control block */ - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); -- cgit v0.10.2 From 7b30f09245d0e6868819b946b2f6879e5d3d106b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Jul 2006 15:02:48 +0200 Subject: [PATCH] cciss: fix stall with softirq handling and CFQ We need to postpone the queue startup until after the softirq handler has actually finished some requests, otherwise we could be racing with cciss_softirq_done() and not actually restart the queue handling. Signed-off-by: Jens Axboe diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 1c4df22..7b0eca7 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1233,6 +1233,50 @@ static inline void complete_buffers(struct bio *bio, int status) } } +static void cciss_check_queues(ctlr_info_t *h) +{ + int start_queue = h->next_to_run; + int i; + + /* check to see if we have maxed out the number of commands that can + * be placed on the queue. If so then exit. We do this check here + * in case the interrupt we serviced was from an ioctl and did not + * free any new commands. + */ + if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS) + return; + + /* We have room on the queue for more commands. Now we need to queue + * them up. We will also keep track of the next queue to run so + * that every queue gets a chance to be started first. + */ + for (i = 0; i < h->highest_lun + 1; i++) { + int curr_queue = (start_queue + i) % (h->highest_lun + 1); + /* make sure the disk has been added and the drive is real + * because this can be called from the middle of init_one. + */ + if (!(h->drv[curr_queue].queue) || !(h->drv[curr_queue].heads)) + continue; + blk_start_queue(h->gendisk[curr_queue]->queue); + + /* check to see if we have maxed out the number of commands + * that can be placed on the queue. 
+ */ + if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS) { + if (curr_queue == start_queue) { + h->next_to_run = + (start_queue + 1) % (h->highest_lun + 1); + break; + } else { + h->next_to_run = curr_queue; + break; + } + } else { + curr_queue = (curr_queue + 1) % (h->highest_lun + 1); + } + } +} + static void cciss_softirq_done(struct request *rq) { CommandList_struct *cmd = rq->completion_data; @@ -1264,6 +1308,7 @@ static void cciss_softirq_done(struct request *rq) spin_lock_irqsave(&h->lock, flags); end_that_request_last(rq, rq->errors); cmd_free(h, cmd, 1); + cciss_check_queues(h); spin_unlock_irqrestore(&h->lock, flags); } @@ -2528,8 +2573,6 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs) CommandList_struct *c; unsigned long flags; __u32 a, a1, a2; - int j; - int start_queue = h->next_to_run; if (interrupt_not_for_us(h)) return IRQ_NONE; @@ -2588,45 +2631,6 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs) } } - /* check to see if we have maxed out the number of commands that can - * be placed on the queue. If so then exit. We do this check here - * in case the interrupt we serviced was from an ioctl and did not - * free any new commands. - */ - if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS) - goto cleanup; - - /* We have room on the queue for more commands. Now we need to queue - * them up. We will also keep track of the next queue to run so - * that every queue gets a chance to be started first. - */ - for (j = 0; j < h->highest_lun + 1; j++) { - int curr_queue = (start_queue + j) % (h->highest_lun + 1); - /* make sure the disk has been added and the drive is real - * because this can be called from the middle of init_one. - */ - if (!(h->drv[curr_queue].queue) || !(h->drv[curr_queue].heads)) - continue; - blk_start_queue(h->gendisk[curr_queue]->queue); - - /* check to see if we have maxed out the number of commands - * that can be placed on the queue. - */ - if ((find_first_zero_bit(h->cmd_pool_bits, NR_CMDS)) == NR_CMDS) { - if (curr_queue == start_queue) { - h->next_to_run = - (start_queue + 1) % (h->highest_lun + 1); - goto cleanup; - } else { - h->next_to_run = curr_queue; - goto cleanup; - } - } else { - curr_queue = (curr_queue + 1) % (h->highest_lun + 1); - } - } - - cleanup: spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags); return IRQ_HANDLED; } -- cgit v0.10.2 From ad01b1ca797e5898cd40bb32cf0dc8c85aa9f7e8 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Tue, 25 Jul 2006 15:04:13 +0200 Subject: [PATCH] blktrace: fix read-ahead bit It should be toggling the same bit on and off, fix it up. Signed-off-by: Jens Axboe diff --git a/block/blktrace.c b/block/blktrace.c index b8c07027..265f7a8 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -80,7 +80,7 @@ static u32 bio_act[5] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_AC #define trace_sync_bit(rw) \ (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1)) #define trace_ahead_bit(rw) \ - (((rw) & (1 << BIO_RW_AHEAD)) << (BIO_RW_AHEAD - 0)) + (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD)) /* * The worker for the various blk_add_trace*() types. Fills out a -- cgit v0.10.2 From 44eb123126d289bac398cac0232309c228386671 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Jul 2006 15:05:21 +0200 Subject: [PATCH] cfq-iosched: don't use a hard jiffies value, translate from msecs The CIC_SEEKY() test really wants to use the minimum of either: - 2 msecs (not jiffies) - or, the pending slice time So code it like that. 
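As a side illustration (not part of the patch, and assuming the usual <linux/jiffies.h> and <linux/kernel.h> helpers, msecs_to_jiffies() and min()), the distinction being fixed is that a bare constant in this spot counts jiffies, whose wall-clock length depends on CONFIG_HZ, whereas msecs_to_jiffies() pins the bound in real time; the helper name below is invented:

static unsigned long cfq_idle_wait(unsigned long slice_left)
{
        /* never idle past the remaining slice, and cap the wait at
         * roughly 2 milliseconds regardless of HZ; a literal "2" would
         * mean 20ms at HZ=100 but 2ms at HZ=1000 */
        return min(slice_left, msecs_to_jiffies(2));
}

So with HZ=100 the old literal 2 amounted to a 20ms idle window, with HZ=1000 to 2ms, and in neither case was it bounded by the remaining slice time.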
Signed-off-by: Jens Axboe diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 102ebc2..aae3123 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -936,7 +936,7 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) * seeks. so allow a little bit of time for him to submit a new rq */ if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) - sl = 2; + sl = min(sl, msecs_to_jiffies(2)); mod_timer(&cfqd->idle_slice_timer, jiffies + sl); return 1; -- cgit v0.10.2 From b9ec6c1b917e2e43a058a78198d54aeca3d71c6f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 25 Jul 2006 16:37:27 -0700 Subject: [TG3]: Add tg3_restart_hw() Add tg3_restart_hw() to handle failures when re-initializing the device. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index ce6f3be..1253cec 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -3590,6 +3590,28 @@ static irqreturn_t tg3_test_isr(int irq, void *dev_id, static int tg3_init_hw(struct tg3 *, int); static int tg3_halt(struct tg3 *, int, int); +/* Restart hardware after configuration changes, self-test, etc. + * Invoked with tp->lock held. + */ +static int tg3_restart_hw(struct tg3 *tp, int reset_phy) +{ + int err; + + err = tg3_init_hw(tp, reset_phy); + if (err) { + printk(KERN_ERR PFX "%s: Failed to re-initialize device, " + "aborting.\n", tp->dev->name); + tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); + tg3_full_unlock(tp); + del_timer_sync(&tp->timer); + tp->irq_sync = 0; + netif_poll_enable(tp->dev); + dev_close(tp->dev); + tg3_full_lock(tp, 0); + } + return err; +} + #ifdef CONFIG_NET_POLL_CONTROLLER static void tg3_poll_controller(struct net_device *dev) { @@ -3630,13 +3652,15 @@ static void tg3_reset_task(void *_data) } tg3_halt(tp, RESET_KIND_SHUTDOWN, 0); - tg3_init_hw(tp, 1); + if (tg3_init_hw(tp, 1)) + goto out; tg3_netif_start(tp); if (restart_timer) mod_timer(&tp->timer, jiffies + 1); +out: tp->tg3_flags &= ~TG3_FLAG_IN_RESET_TASK; tg3_full_unlock(tp); @@ -4124,6 +4148,7 @@ static inline void tg3_set_mtu(struct net_device *dev, struct tg3 *tp, static int tg3_change_mtu(struct net_device *dev, int new_mtu) { struct tg3 *tp = netdev_priv(dev); + int err; if (new_mtu < TG3_MIN_MTU || new_mtu > TG3_MAX_MTU(tp)) return -EINVAL; @@ -4144,13 +4169,14 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu) tg3_set_mtu(dev, tp, new_mtu); - tg3_init_hw(tp, 0); + err = tg3_restart_hw(tp, 0); - tg3_netif_start(tp); + if (!err) + tg3_netif_start(tp); tg3_full_unlock(tp); - return 0; + return err; } /* Free up pending packets in all rx/tx rings. @@ -5815,6 +5841,7 @@ static int tg3_set_mac_addr(struct net_device *dev, void *p) { struct tg3 *tp = netdev_priv(dev); struct sockaddr *addr = p; + int err = 0; if (!is_valid_ether_addr(addr->sa_data)) return -EINVAL; @@ -5832,9 +5859,9 @@ static int tg3_set_mac_addr(struct net_device *dev, void *p) tg3_full_lock(tp, 1); tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); - tg3_init_hw(tp, 0); - - tg3_netif_start(tp); + err = tg3_restart_hw(tp, 0); + if (!err) + tg3_netif_start(tp); tg3_full_unlock(tp); } else { spin_lock_bh(&tp->lock); @@ -5842,7 +5869,7 @@ static int tg3_set_mac_addr(struct net_device *dev, void *p) spin_unlock_bh(&tp->lock); } - return 0; + return err; } /* tp->lock is held. 
*/ @@ -7956,7 +7983,7 @@ static void tg3_get_ringparam(struct net_device *dev, struct ethtool_ringparam * static int tg3_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ering) { struct tg3 *tp = netdev_priv(dev); - int irq_sync = 0; + int irq_sync = 0, err = 0; if ((ering->rx_pending > TG3_RX_RING_SIZE - 1) || (ering->rx_jumbo_pending > TG3_RX_JUMBO_RING_SIZE - 1) || @@ -7980,13 +8007,14 @@ static int tg3_set_ringparam(struct net_device *dev, struct ethtool_ringparam *e if (netif_running(dev)) { tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); - tg3_init_hw(tp, 1); - tg3_netif_start(tp); + err = tg3_restart_hw(tp, 1); + if (!err) + tg3_netif_start(tp); } tg3_full_unlock(tp); - return 0; + return err; } static void tg3_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam *epause) @@ -8001,7 +8029,7 @@ static void tg3_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *epause) { struct tg3 *tp = netdev_priv(dev); - int irq_sync = 0; + int irq_sync = 0, err = 0; if (netif_running(dev)) { tg3_netif_stop(tp); @@ -8025,13 +8053,14 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam if (netif_running(dev)) { tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); - tg3_init_hw(tp, 1); - tg3_netif_start(tp); + err = tg3_restart_hw(tp, 1); + if (!err) + tg3_netif_start(tp); } tg3_full_unlock(tp); - return 0; + return err; } static u32 tg3_get_rx_csum(struct net_device *dev) @@ -8666,7 +8695,9 @@ static int tg3_test_loopback(struct tg3 *tp) if (!netif_running(tp->dev)) return TG3_LOOPBACK_FAILED; - tg3_reset_hw(tp, 1); + err = tg3_reset_hw(tp, 1); + if (err) + return TG3_LOOPBACK_FAILED; if (tg3_run_loopback(tp, TG3_MAC_LOOPBACK)) err |= TG3_MAC_LOOPBACK_FAILED; @@ -8740,8 +8771,8 @@ static void tg3_self_test(struct net_device *dev, struct ethtool_test *etest, tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); if (netif_running(dev)) { tp->tg3_flags |= TG3_FLAG_INIT_COMPLETE; - tg3_init_hw(tp, 1); - tg3_netif_start(tp); + if (!tg3_restart_hw(tp, 1)) + tg3_netif_start(tp); } tg3_full_unlock(tp); @@ -11699,7 +11730,8 @@ static int tg3_suspend(struct pci_dev *pdev, pm_message_t state) tg3_full_lock(tp, 0); tp->tg3_flags |= TG3_FLAG_INIT_COMPLETE; - tg3_init_hw(tp, 1); + if (tg3_restart_hw(tp, 1)) + goto out; tp->timer.expires = jiffies + tp->timer_offset; add_timer(&tp->timer); @@ -11707,6 +11739,7 @@ static int tg3_suspend(struct pci_dev *pdev, pm_message_t state) netif_device_attach(dev); tg3_netif_start(tp); +out: tg3_full_unlock(tp); } @@ -11733,16 +11766,19 @@ static int tg3_resume(struct pci_dev *pdev) tg3_full_lock(tp, 0); tp->tg3_flags |= TG3_FLAG_INIT_COMPLETE; - tg3_init_hw(tp, 1); + err = tg3_restart_hw(tp, 1); + if (err) + goto out; tp->timer.expires = jiffies + tp->timer_offset; add_timer(&tp->timer); tg3_netif_start(tp); +out: tg3_full_unlock(tp); - return 0; + return err; } static struct pci_driver tg3_driver = { -- cgit v0.10.2 From 32d8c5724b7b05c7d8f7386c49432104cc222e32 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 25 Jul 2006 16:38:29 -0700 Subject: [TG3]: Handle tg3_init_rings() failures Handle dev_alloc_skb() failures when initializing the RX rings. Without proper handling, the driver will crash when using a partial ring. Thanks to Stephane Doyon for reporting the bug and providing the initial patch. Howie Xu also reported the same issue. Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 1253cec..d66b06f 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -4258,7 +4258,7 @@ static void tg3_free_rings(struct tg3 *tp) * end up in the driver. tp->{tx,}lock are held and thus * we may not sleep. */ -static void tg3_init_rings(struct tg3 *tp) +static int tg3_init_rings(struct tg3 *tp) { u32 i; @@ -4307,18 +4307,38 @@ static void tg3_init_rings(struct tg3 *tp) /* Now allocate fresh SKBs for each rx ring. */ for (i = 0; i < tp->rx_pending; i++) { - if (tg3_alloc_rx_skb(tp, RXD_OPAQUE_RING_STD, - -1, i) < 0) + if (tg3_alloc_rx_skb(tp, RXD_OPAQUE_RING_STD, -1, i) < 0) { + printk(KERN_WARNING PFX + "%s: Using a smaller RX standard ring, " + "only %d out of %d buffers were allocated " + "successfully.\n", + tp->dev->name, i, tp->rx_pending); + if (i == 0) + return -ENOMEM; + tp->rx_pending = i; break; + } } if (tp->tg3_flags & TG3_FLAG_JUMBO_RING_ENABLE) { for (i = 0; i < tp->rx_jumbo_pending; i++) { if (tg3_alloc_rx_skb(tp, RXD_OPAQUE_RING_JUMBO, - -1, i) < 0) + -1, i) < 0) { + printk(KERN_WARNING PFX + "%s: Using a smaller RX jumbo ring, " + "only %d out of %d buffers were " + "allocated successfully.\n", + tp->dev->name, i, tp->rx_jumbo_pending); + if (i == 0) { + tg3_free_rings(tp); + return -ENOMEM; + } + tp->rx_jumbo_pending = i; break; + } } } + return 0; } /* @@ -5969,7 +5989,9 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy) * can only do this after the hardware has been * successfully reset. */ - tg3_init_rings(tp); + err = tg3_init_rings(tp); + if (err) + return err; /* This value is determined during the probe time DMA * engine test, tg3_test_dma. -- cgit v0.10.2 From b6e77a5346d8a739227ed73c2269966a4fd652b4 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 25 Jul 2006 16:39:12 -0700 Subject: [TG3]: Update version and reldate Update version to 3.63. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index d66b06f..1b8138f 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -68,8 +68,8 @@ #define DRV_MODULE_NAME "tg3" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "3.62" -#define DRV_MODULE_RELDATE "June 30, 2006" +#define DRV_MODULE_VERSION "3.63" +#define DRV_MODULE_RELDATE "July 25, 2006" #define TG3_DEF_MAC_MODE 0 #define TG3_DEF_RX_MODE 0 -- cgit v0.10.2 From 722874909271a807b243a797c2958e0a12992964 Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Tue, 25 Jul 2006 16:45:12 -0700 Subject: [IPV4] ipmr: ip multicast route bug fix. IP multicast route code was reusing an skb which causes use after free and double free. From: Alexey Kuznetsov Note, it is real skb_clone(), not alloc_skb(). Equeued skb contains the whole half-prepared netlink message plus room for the rest. It could be also skb_copy(), if we want to be puristic about mangling cloned data, but original copy is really not going to be used. Acked-by: Stephen Hemminger Signed-off-by: David S. 
Miller diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9ccacf5..85893ee 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1578,6 +1578,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); if (cache==NULL) { + struct sk_buff *skb2; struct net_device *dev; int vif; @@ -1591,12 +1592,18 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) read_unlock(&mrt_lock); return -ENODEV; } - skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); - skb->nh.iph->ihl = sizeof(struct iphdr)>>2; - skb->nh.iph->saddr = rt->rt_src; - skb->nh.iph->daddr = rt->rt_dst; - skb->nh.iph->version = 0; - err = ipmr_cache_unresolved(vif, skb); + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) { + read_unlock(&mrt_lock); + return -ENOMEM; + } + + skb2->nh.raw = skb_push(skb2, sizeof(struct iphdr)); + skb2->nh.iph->ihl = sizeof(struct iphdr)>>2; + skb2->nh.iph->saddr = rt->rt_src; + skb2->nh.iph->daddr = rt->rt_dst; + skb2->nh.iph->version = 0; + err = ipmr_cache_unresolved(vif, skb2); read_unlock(&mrt_lock); return err; } -- cgit v0.10.2 From f59fc7f30b710d45aadf715460b3e60dbe9d3418 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 25 Jul 2006 17:05:35 -0700 Subject: [IPV4/IPV6]: Setting 0 for unused port field in RAW IP recvmsg(). From: Tetsuo Handa from-linux-kernel@i-love.sakura.ne.jp The recvmsg() for raw socket seems to return random u16 value from the kernel stack memory since port field is not initialized. But I'm not sure this patch is correct. Does raw socket return any information stored in port field? [ BSD defines RAW IP recvmsg to return a sin_port value of zero. This is described in Steven's TCP/IP Illustrated Volume 2 on page 1055, which is discussing the BSD rip_input() implementation. ] Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bd221ec..62b2762 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -609,6 +609,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); } if (inet->cmsg_flags) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index fa1ce0a..d57e61c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -411,6 +411,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, /* Copy the address. */ if (sin6) { sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr); sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; -- cgit v0.10.2 From 153d7f3fcae7ed4e19328549aa9467acdfbced10 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 26 Jul 2006 15:40:07 +0200 Subject: [PATCH] Reorganize the cpufreq cpu hotplug locking to not be totally bizare The patch below moves the cpu hotplugging higher up in the cpufreq layering; this is needed to avoid recursive taking of the cpu hotplug lock and to otherwise detangle the mess. The new rules are: 1. you must do lock_cpu_hotplug() around the following functions: __cpufreq_driver_target __cpufreq_governor (for CPUFREQ_GOV_LIMITS operation only) __cpufreq_set_policy 2. governer methods (.governer) must NOT take the lock_cpu_hotplug() lock in any way; they are called with the lock taken already 3. if your governer spawns a thread that does things, like calling __cpufreq_driver_target, your thread must honor rule #1. 4. 
the policy lock and other cpufreq internal locks nest within the lock_cpu_hotplug() lock. I'm not entirely happy about how the __cpufreq_governor rule ended up (conditional locking rule depending on the argument) but basically all callers pass this as a constant so it's not too horrible. The patch also removes the cpufreq_governor() function since during the locking audit it turned out to be entirely unused (so no need to fix it) The patch works on my testbox, but it could use more testing (otoh... it can't be much worse than the current code) Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 8d32818..bc1088d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -364,10 +364,12 @@ static ssize_t store_##file_name \ if (ret != 1) \ return -EINVAL; \ \ + lock_cpu_hotplug(); \ mutex_lock(&policy->lock); \ ret = __cpufreq_set_policy(policy, &new_policy); \ policy->user_policy.object = policy->object; \ mutex_unlock(&policy->lock); \ + unlock_cpu_hotplug(); \ \ return ret ? ret : count; \ } @@ -1197,20 +1199,18 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier); *********************************************************************/ +/* Must be called with lock_cpu_hotplug held */ int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { int retval = -EINVAL; - lock_cpu_hotplug(); dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu, target_freq, relation); if (cpu_online(policy->cpu) && cpufreq_driver->target) retval = cpufreq_driver->target(policy, target_freq, relation); - unlock_cpu_hotplug(); - return retval; } EXPORT_SYMBOL_GPL(__cpufreq_driver_target); @@ -1225,17 +1225,23 @@ int cpufreq_driver_target(struct cpufreq_policy *policy, if (!policy) return -EINVAL; + lock_cpu_hotplug(); mutex_lock(&policy->lock); ret = __cpufreq_driver_target(policy, target_freq, relation); mutex_unlock(&policy->lock); + unlock_cpu_hotplug(); cpufreq_cpu_put(policy); return ret; } EXPORT_SYMBOL_GPL(cpufreq_driver_target); +/* + * Locking: Must be called with the lock_cpu_hotplug() lock held + * when "event" is CPUFREQ_GOV_LIMITS + */ static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) { @@ -1257,24 +1263,6 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) } -int cpufreq_governor(unsigned int cpu, unsigned int event) -{ - int ret = 0; - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - - if (!policy) - return -EINVAL; - - mutex_lock(&policy->lock); - ret = __cpufreq_governor(policy, event); - mutex_unlock(&policy->lock); - - cpufreq_cpu_put(policy); - return ret; -} -EXPORT_SYMBOL_GPL(cpufreq_governor); - - int cpufreq_register_governor(struct cpufreq_governor *governor) { struct cpufreq_governor *t; @@ -1342,6 +1330,9 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu) EXPORT_SYMBOL(cpufreq_get_policy); +/* + * Locking: Must be called with the lock_cpu_hotplug() lock held + */ static int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_policy *policy) { int ret = 0; @@ -1436,6 +1427,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy) if (!data) return -EINVAL; + lock_cpu_hotplug(); + /* lock this CPU */ mutex_lock(&data->lock); @@ -1446,6 +1439,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy) data->user_policy.governor = data->governor; mutex_unlock(&data->lock); + + unlock_cpu_hotplug(); cpufreq_cpu_put(data); return ret; @@ 
-1469,6 +1464,7 @@ int cpufreq_update_policy(unsigned int cpu) if (!data) return -ENODEV; + lock_cpu_hotplug(); mutex_lock(&data->lock); dprintk("updating policy for CPU %u\n", cpu); @@ -1494,7 +1490,7 @@ int cpufreq_update_policy(unsigned int cpu) ret = __cpufreq_set_policy(data, &policy); mutex_unlock(&data->lock); - + unlock_cpu_hotplug(); cpufreq_cpu_put(data); return ret; } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index b3ebc8f..c4c578d 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -525,7 +525,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, break; case CPUFREQ_GOV_LIMITS: - lock_cpu_hotplug(); mutex_lock(&dbs_mutex); if (policy->max < this_dbs_info->cur_policy->cur) __cpufreq_driver_target( @@ -536,7 +535,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, this_dbs_info->cur_policy, policy->min, CPUFREQ_RELATION_L); mutex_unlock(&dbs_mutex); - unlock_cpu_hotplug(); break; } return 0; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 178f0c5..52cf1f0 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -309,7 +309,9 @@ static void do_dbs_timer(void *data) if (!dbs_info->enable) return; + lock_cpu_hotplug(); dbs_check_cpu(dbs_info); + unlock_cpu_hotplug(); queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, usecs_to_jiffies(dbs_tuners_ins.sampling_rate)); } @@ -412,7 +414,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, break; case CPUFREQ_GOV_LIMITS: - lock_cpu_hotplug(); mutex_lock(&dbs_mutex); if (policy->max < this_dbs_info->cur_policy->cur) __cpufreq_driver_target(this_dbs_info->cur_policy, @@ -423,7 +424,6 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, policy->min, CPUFREQ_RELATION_L); mutex_unlock(&dbs_mutex); - unlock_cpu_hotplug(); break; } return 0; diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index 44ae5e5..a06c204 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -70,6 +71,7 @@ static int cpufreq_set(unsigned int freq, struct cpufreq_policy *policy) dprintk("cpufreq_set for cpu %u, freq %u kHz\n", policy->cpu, freq); + lock_cpu_hotplug(); mutex_lock(&userspace_mutex); if (!cpu_is_managed[policy->cpu]) goto err; @@ -92,6 +94,7 @@ static int cpufreq_set(unsigned int freq, struct cpufreq_policy *policy) err: mutex_unlock(&userspace_mutex); + unlock_cpu_hotplug(); return ret; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 35e1376..4ea39fe 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -172,9 +172,6 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int relation); -/* pass an event to the cpufreq governor */ -int cpufreq_governor(unsigned int cpu, unsigned int event); - int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); -- cgit v0.10.2 From 64821324ca49f24be1a66f2f432108f96a24e596 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jul 2006 09:53:23 +0200 Subject: [PATCH] fix compile regression for a few scsi drivers This fixes three drivers to compile again after my patch that removes the data_cmnd member from struct scsi_cmnd. 
The fas216 change is trivial, it should have been using ->cmnd all the time. NCR53C9 (which seem to be mostly duplicate driver with esp.c!) is doing something odd, it should only have looked at ->cmnd before not the saved copy that is kept for the error handlers sake. Note that it really should deal with the sync setting themselves but use the generic domain validation code that get this right - but that's for later let's push this simple compile fix for now. And sorry for the late fix for this, I have been busy with OLS and associated activities last week. Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds diff --git a/drivers/scsi/NCR53C9x.c b/drivers/scsi/NCR53C9x.c index 085db48..bdc6bb2 100644 --- a/drivers/scsi/NCR53C9x.c +++ b/drivers/scsi/NCR53C9x.c @@ -2152,29 +2152,23 @@ static int esp_do_data_finale(struct NCR_ESP *esp, */ static int esp_should_clear_sync(Scsi_Cmnd *sp) { - unchar cmd1 = sp->cmnd[0]; - unchar cmd2 = sp->data_cmnd[0]; + unchar cmd = sp->cmnd[0]; /* These cases are for spinning up a disk and * waiting for that spinup to complete. */ - if(cmd1 == START_STOP || - cmd2 == START_STOP) + if(cmd == START_STOP) return 0; - if(cmd1 == TEST_UNIT_READY || - cmd2 == TEST_UNIT_READY) + if(cmd == TEST_UNIT_READY) return 0; /* One more special case for SCSI tape drives, * this is what is used to probe the device for * completion of a rewind or tape load operation. */ - if(sp->device->type == TYPE_TAPE) { - if(cmd1 == MODE_SENSE || - cmd2 == MODE_SENSE) - return 0; - } + if(sp->device->type == TYPE_TAPE && cmd == MODE_SENSE) + return 0; return 1; } diff --git a/drivers/scsi/arm/fas216.c b/drivers/scsi/arm/fas216.c index 3e1053f..4cf7afc 100644 --- a/drivers/scsi/arm/fas216.c +++ b/drivers/scsi/arm/fas216.c @@ -2427,7 +2427,7 @@ int fas216_eh_abort(Scsi_Cmnd *SCpnt) info->stats.aborts += 1; printk(KERN_WARNING "scsi%d: abort command ", info->host->host_no); - __scsi_print_command(SCpnt->data_cmnd); + __scsi_print_command(SCpnt->cmnd); print_debug_list(); fas216_dumpstate(info); -- cgit v0.10.2 From ba4ba8a69dcb446450b5ddeca48a7bd15783f4c2 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Thu, 27 Jul 2006 14:00:23 +0200 Subject: [S390] permanent subchannel busy conditions may cause I/O stall In special conditions where a subchannel rejects the HALT I/O- instruction with a busy indication (cc 2), I/O may stall. I/O request termination logic retries HALT I/O indefinitely because it expects HALT I/O to alter the subchannel status which is not true when cc 2 is returned. In case of a busy indication, try CLEAR I/O instruction immediately. Signed-off-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index ac6e0c7..7a39e0b 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -152,7 +152,8 @@ ccw_device_cancel_halt_clear(struct ccw_device *cdev) if (cdev->private->iretry) { cdev->private->iretry--; ret = cio_halt(sch); - return (ret == 0) ? -EBUSY : ret; + if (ret != -EBUSY) + return (ret == 0) ? -EBUSY : ret; } /* halt io unsuccessful. */ cdev->private->iretry = 255; /* 255 clear retries. */ -- cgit v0.10.2 From 17088229846c078aa936ca64912ab221d083aca1 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 27 Jul 2006 14:00:33 +0200 Subject: [S390] duplicate ccw devices in ccwgroup. Fail to create a ccwgroup device if a ccw device is passed in twice. 
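For illustration only (the patch itself does not do it this way), an equivalent guard on the bus-id strings handed to the group attribute could look like the plain C sketch below; the helper name and example ids are made up:

#include <string.h>

/* hypothetical helper: reject argument lists such as
 * "0.0.f500,0.0.f500,0.0.f502" where the same bus id occurs twice */
static int busid_list_has_duplicate(int argc, char *argv[])
{
        int i, j;

        for (i = 0; i < argc; i++)
                for (j = i + 1; j < argc; j++)
                        if (strcmp(argv[i], argv[j]) == 0)
                                return 1;
        return 0;
}

As the error path in the diff below suggests, the actual fix instead uses each cdev's driver_data back-pointer as the "already claimed by a group" marker, which also keeps the put_device()/driver_data cleanup on failure consistent.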
Signed-off-by: Cornelia Huck Signed-off-by: Martin Schwidefsky diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index f26a2ee..3cba6c9 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -152,7 +152,6 @@ ccwgroup_create(struct device *root, struct ccwgroup_device *gdev; int i; int rc; - int del_drvdata; if (argc > 256) /* disallow dumb users */ return -EINVAL; @@ -163,7 +162,6 @@ ccwgroup_create(struct device *root, atomic_set(&gdev->onoff, 0); - del_drvdata = 0; for (i = 0; i < argc; i++) { gdev->cdev[i] = get_ccwdev_by_busid(cdrv, argv[i]); @@ -180,10 +178,8 @@ ccwgroup_create(struct device *root, rc = -EINVAL; goto free_dev; } - } - for (i = 0; i < argc; i++) gdev->cdev[i]->dev.driver_data = gdev; - del_drvdata = 1; + } gdev->creator_id = creator_id; gdev->count = argc; @@ -226,9 +222,9 @@ error: free_dev: for (i = 0; i < argc; i++) if (gdev->cdev[i]) { - put_device(&gdev->cdev[i]->dev); - if (del_drvdata) + if (gdev->cdev[i]->dev.driver_data == gdev) gdev->cdev[i]->dev.driver_data = NULL; + put_device(&gdev->cdev[i]->dev); } kfree(gdev); return rc; -- cgit v0.10.2 From 468310a8a7af4f3933ade2700f01d493fa1a9754 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 27 Jul 2006 14:04:57 +0200 Subject: [S390] update default configuration Signed-off-by: Martin Schwidefsky diff --git a/arch/s390/defconfig b/arch/s390/defconfig index f4dfc10..f1d4591 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -1,13 +1,16 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.17-rc1 -# Mon Apr 3 14:34:15 2006 +# Linux kernel version: 2.6.18-rc2 +# Thu Jul 27 13:51:07 2006 # CONFIG_MMU=y +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_S390=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # # Code maturity level options @@ -25,6 +28,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set +# CONFIG_TASKSTATS is not set CONFIG_SYSCTL=y CONFIG_AUDIT=y # CONFIG_AUDITSYSCALL is not set @@ -43,10 +47,12 @@ CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_BASE_FULL=y +CONFIG_RT_MUTEXES=y CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y CONFIG_SLAB=y +CONFIG_VM_EVENT_COUNTERS=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 # CONFIG_SLOB is not set @@ -94,7 +100,6 @@ CONFIG_HOTPLUG_CPU=y CONFIG_DEFAULT_MIGRATION_COST=1000000 CONFIG_COMPAT=y CONFIG_SYSVIPC_COMPAT=y -CONFIG_BINFMT_ELF32=y # # Code generation options @@ -115,6 +120,7 @@ CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_RESOURCES_64BIT=y # # I/O subsystem configuration @@ -142,6 +148,7 @@ CONFIG_VIRT_CPU_ACCOUNTING=y # CONFIG_APPLDATA_BASE is not set CONFIG_NO_IDLE_HZ=y CONFIG_NO_IDLE_HZ_INIT=y +CONFIG_S390_HYPFS_FS=y CONFIG_KEXEC=y # @@ -174,6 +181,8 @@ CONFIG_IP_FIB_HASH=y # CONFIG_INET_IPCOMP is not set # CONFIG_INET_XFRM_TUNNEL is not set # CONFIG_INET_TUNNEL is not set +CONFIG_INET_XFRM_MODE_TRANSPORT=y +CONFIG_INET_XFRM_MODE_TUNNEL=y CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -186,7 +195,10 @@ CONFIG_IPV6=y # CONFIG_INET6_IPCOMP is not set # CONFIG_INET6_XFRM_TUNNEL is not set # CONFIG_INET6_TUNNEL is not set +CONFIG_INET6_XFRM_MODE_TRANSPORT=y +CONFIG_INET6_XFRM_MODE_TUNNEL=y # CONFIG_IPV6_TUNNEL is not set +# CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set # @@ -263,6 
+275,7 @@ CONFIG_NET_ESTIMATOR=y # Network testing # # CONFIG_NET_PKTGEN is not set +# CONFIG_NET_TCPPROBE is not set # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set @@ -276,6 +289,7 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y # CONFIG_FW_LOADER is not set # CONFIG_DEBUG_DRIVER is not set +CONFIG_SYS_HYPERVISOR=y # # Connector - unified userspace <-> kernelspace linker @@ -334,6 +348,7 @@ CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set @@ -359,9 +374,7 @@ CONFIG_MD_LINEAR=m CONFIG_MD_RAID0=m CONFIG_MD_RAID1=m # CONFIG_MD_RAID10 is not set -CONFIG_MD_RAID5=m -# CONFIG_MD_RAID5_RESHAPE is not set -# CONFIG_MD_RAID6 is not set +# CONFIG_MD_RAID456 is not set CONFIG_MD_MULTIPATH=m # CONFIG_MD_FAULTY is not set CONFIG_BLK_DEV_DM=y @@ -419,7 +432,8 @@ CONFIG_S390_TAPE_34XX=m # # Cryptographic devices # -CONFIG_Z90CRYPT=m +CONFIG_ZCRYPT=m +# CONFIG_ZCRYPT_MONOLITHIC is not set # # Network device support @@ -509,6 +523,7 @@ CONFIG_FS_MBCACHE=y # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set CONFIG_INOTIFY=y +CONFIG_INOTIFY_USER=y # CONFIG_QUOTA is not set CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set @@ -614,26 +629,36 @@ CONFIG_MSDOS_PARTITION=y # Instrumentation Support # # CONFIG_PROFILING is not set -# CONFIG_STATISTICS is not set +CONFIG_STATISTICS=y +CONFIG_KPROBES=y # # Kernel hacking # +CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set CONFIG_MAGIC_SYSRQ=y +# CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_KERNEL=y CONFIG_LOG_BUF_SHIFT=17 # CONFIG_DETECT_SOFTLOCKUP is not set # CONFIG_SCHEDSTATS is not set # CONFIG_DEBUG_SLAB is not set CONFIG_DEBUG_PREEMPT=y -CONFIG_DEBUG_MUTEXES=y +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_RT_MUTEX_TESTER is not set CONFIG_DEBUG_SPINLOCK=y +CONFIG_DEBUG_MUTEXES=y +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set CONFIG_DEBUG_SPINLOCK_SLEEP=y +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_INFO is not set CONFIG_DEBUG_FS=y # CONFIG_DEBUG_VM is not set +# CONFIG_FRAME_POINTER is not set # CONFIG_UNWIND_INFO is not set CONFIG_FORCED_INLINING=y # CONFIG_RCU_TORTURE_TEST is not set @@ -688,3 +713,4 @@ CONFIG_CRYPTO=y # CONFIG_CRC16 is not set CONFIG_CRC32=m # CONFIG_LIBCRC32C is not set +CONFIG_PLIST=y -- cgit v0.10.2 From 92f282988b4ce3967ee8399f7d1184ebfa04e48b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 27 Jul 2006 16:49:21 -0700 Subject: [SPARC64]: Fix quad-float multiply emulation. Something is wrong with the 3-multiply (vs. 4-multiply) optimized version of _FP_MUL_MEAT_2_*(), so just use the slower version which actually computes correct values. Noticed by Rene Rebe Signed-off-by: David S. 
Miller diff --git a/include/asm-sparc64/sfp-machine.h b/include/asm-sparc64/sfp-machine.h index 5015bb8..89d4243 100644 --- a/include/asm-sparc64/sfp-machine.h +++ b/include/asm-sparc64/sfp-machine.h @@ -34,7 +34,7 @@ #define _FP_MUL_MEAT_D(R,X,Y) \ _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm) #define _FP_MUL_MEAT_Q(R,X,Y) \ - _FP_MUL_MEAT_2_wide_3mul(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) + _FP_MUL_MEAT_2_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm) #define _FP_DIV_MEAT_S(R,X,Y) _FP_DIV_MEAT_1_imm(S,R,X,Y,_FP_DIV_HELP_imm) #define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_1_udiv_norm(D,R,X,Y) -- cgit v0.10.2 From b8cfac4c2f3d12d0f4cbe6f992d945f2fdfc098d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 27 Jul 2006 17:57:32 -0700 Subject: [SPARC64]: Fix typo in pgprot_noncached(). The sun4v code sequence was or'ing in the sun4u pte bits by mistake. Signed-off-by: David S. Miller diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h index 03f5bc9..1ba19eb 100644 --- a/include/asm-sparc64/pgtable.h +++ b/include/asm-sparc64/pgtable.h @@ -339,7 +339,7 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot) " .section .sun4v_2insn_patch, \"ax\"\n" " .word 661b\n" " andn %0, %4, %0\n" - " or %0, %3, %0\n" + " or %0, %5, %0\n" " .previous\n" : "=r" (val) : "0" (val), "i" (_PAGE_CP_4U | _PAGE_CV_4U), "i" (_PAGE_E_4U), -- cgit v0.10.2 From 96ba989d22de779ca19ca214e2b2e53a4ca86b7b Mon Sep 17 00:00:00 2001 From: Bob Breuer Date: Thu, 27 Jul 2006 22:08:01 -0700 Subject: [SPARC]: Defer clock_probe to fs_initcall() From: Bob Breuer That way all the of_driver bits will be ready. Signed-off-by: David S. Miller diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c index 04eb1ea..845081b0 100644 --- a/arch/sparc/kernel/time.c +++ b/arch/sparc/kernel/time.c @@ -225,6 +225,32 @@ static __inline__ int has_low_battery(void) return (data1 == data2); /* Was the write blocked? */ } +static void __init mostek_set_system_time(void) +{ + unsigned int year, mon, day, hour, min, sec; + struct mostek48t02 *mregs; + + mregs = (struct mostek48t02 *)mstk48t02_regs; + if(!mregs) { + prom_printf("Something wrong, clock regs not mapped yet.\n"); + prom_halt(); + } + spin_lock_irq(&mostek_lock); + mregs->creg |= MSTK_CREG_READ; + sec = MSTK_REG_SEC(mregs); + min = MSTK_REG_MIN(mregs); + hour = MSTK_REG_HOUR(mregs); + day = MSTK_REG_DOM(mregs); + mon = MSTK_REG_MONTH(mregs); + year = MSTK_CVT_YEAR( MSTK_REG_YEAR(mregs) ); + xtime.tv_sec = mktime(year, mon, day, hour, min, sec); + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + mregs->creg &= ~MSTK_CREG_READ; + spin_unlock_irq(&mostek_lock); +} + /* Probe for the real time clock chip on Sun4 */ static __inline__ void sun4_clock_probe(void) { @@ -273,6 +299,7 @@ static __inline__ void sun4_clock_probe(void) #endif } +#ifndef CONFIG_SUN4 static int __devinit clock_probe(struct of_device *op, const struct of_device_id *match) { struct device_node *dp = op->node; @@ -307,6 +334,8 @@ static int __devinit clock_probe(struct of_device *op, const struct of_device_id if (mostek_read(mstk48t02_regs + MOSTEK_SEC) & MSTK_STOP) kick_start_clock(); + mostek_set_system_time(); + return 0; } @@ -325,56 +354,37 @@ static struct of_platform_driver clock_driver = { /* Probe for the mostek real time clock chip. 
*/ -static void clock_init(void) +static int __init clock_init(void) { - of_register_driver(&clock_driver, &of_bus_type); + return of_register_driver(&clock_driver, &of_bus_type); } +/* Must be after subsys_initcall() so that busses are probed. Must + * be before device_initcall() because things like the RTC driver + * need to see the clock registers. + */ +fs_initcall(clock_init); +#endif /* !CONFIG_SUN4 */ + void __init sbus_time_init(void) { - unsigned int year, mon, day, hour, min, sec; - struct mostek48t02 *mregs; - -#ifdef CONFIG_SUN4 - int temp; - struct intersil *iregs; -#endif BTFIXUPSET_CALL(bus_do_settimeofday, sbus_do_settimeofday, BTFIXUPCALL_NORM); btfixup(); if (ARCH_SUN4) sun4_clock_probe(); - else - clock_init(); sparc_init_timers(timer_interrupt); #ifdef CONFIG_SUN4 if(idprom->id_machtype == (SM_SUN4 | SM_4_330)) { -#endif - mregs = (struct mostek48t02 *)mstk48t02_regs; - if(!mregs) { - prom_printf("Something wrong, clock regs not mapped yet.\n"); - prom_halt(); - } - spin_lock_irq(&mostek_lock); - mregs->creg |= MSTK_CREG_READ; - sec = MSTK_REG_SEC(mregs); - min = MSTK_REG_MIN(mregs); - hour = MSTK_REG_HOUR(mregs); - day = MSTK_REG_DOM(mregs); - mon = MSTK_REG_MONTH(mregs); - year = MSTK_CVT_YEAR( MSTK_REG_YEAR(mregs) ); - xtime.tv_sec = mktime(year, mon, day, hour, min, sec); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - mregs->creg &= ~MSTK_CREG_READ; - spin_unlock_irq(&mostek_lock); -#ifdef CONFIG_SUN4 + mostek_set_system_time(); } else if(idprom->id_machtype == (SM_SUN4 | SM_4_260) ) { /* initialise the intersil on sun4 */ + unsigned int year, mon, day, hour, min, sec; + int temp; + struct intersil *iregs; iregs=intersil_clock; if(!iregs) { -- cgit v0.10.2 From 361934849e9c0418950bedf667732f36337d88b9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 28 Jul 2006 08:54:59 +0200 Subject: [PATCH] ide: option to disable cache flushes for buggy drives Some drives claim they support cache flushing, but get seriously confused if you try. Add this option to be able to boot with barriers enabled by default. Signed-off-by: Jens Axboe diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index f712e4c..7cf3eb0 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -776,7 +776,7 @@ static void update_ordered(ide_drive_t *drive) * not available so we don't need to recheck that. 
*/ capacity = idedisk_capacity(drive); - barrier = ide_id_has_flush_cache(id) && + barrier = ide_id_has_flush_cache(id) && !drive->noflush && (drive->addressing == 0 || capacity <= (1ULL << 28) || ide_id_has_flush_cache_ext(id)); diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index 05fbd92..defd4b4 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -1539,7 +1539,7 @@ static int __init ide_setup(char *s) const char *hd_words[] = { "none", "noprobe", "nowerr", "cdrom", "serialize", "autotune", "noautotune", "minus8", "swapdata", "bswap", - "minus11", "remap", "remap63", "scsi", NULL }; + "noflush", "remap", "remap63", "scsi", NULL }; unit = s[2] - 'a'; hw = unit / MAX_DRIVES; unit = unit % MAX_DRIVES; @@ -1578,6 +1578,9 @@ static int __init ide_setup(char *s) case -10: /* "bswap" */ drive->bswap = 1; goto done; + case -11: /* noflush */ + drive->noflush = 1; + goto done; case -12: /* "remap" */ drive->remap_0_to_1 = 1; goto done; diff --git a/include/linux/ide.h b/include/linux/ide.h index dc7abef..9962045 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -571,6 +571,7 @@ typedef struct ide_drive_s { u8 waiting_for_dma; /* dma currently in progress */ u8 unmask; /* okay to unmask other irqs */ u8 bswap; /* byte swap data */ + u8 noflush; /* don't attempt flushes */ u8 dsc_overlap; /* DSC overlap */ u8 nice1; /* give potential excess bandwidth */ -- cgit v0.10.2 From 0a8348d08677ad77ee353f96eb8745c693a05a13 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 28 Jul 2006 08:58:26 +0200 Subject: [PATCH] ide: if the id fields looks screwy, disable DMA It's the safer choice. Originally due to a bug in itx821x, but a generally sound thing to do. Signed-off-by: Jens Axboe diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 98918fb..7c3a13e 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -750,7 +750,7 @@ void ide_dma_verbose(ide_drive_t *drive) goto bug_dma_off; printk(", DMA"); } else if (id->field_valid & 1) { - printk(", BUG"); + goto bug_dma_off; } return; bug_dma_off: -- cgit v0.10.2 From 71ef51cc1756d1c56b57c70e7cc27a3559c81ee6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 28 Jul 2006 09:02:17 +0200 Subject: [PATCH] it821x: fix ide dma setup bug Only enable dma for a valid speed setting. Signed-off-by: Jens Axboe diff --git a/drivers/ide/pci/it821x.c b/drivers/ide/pci/it821x.c index 3cb0442..e9bad18 100644 --- a/drivers/ide/pci/it821x.c +++ b/drivers/ide/pci/it821x.c @@ -498,9 +498,14 @@ static int config_chipset_for_dma (ide_drive_t *drive) { u8 speed = ide_dma_speed(drive, it821x_ratemask(drive)); - config_it821x_chipset_for_pio(drive, !speed); - it821x_tune_chipset(drive, speed); - return ide_dma_enable(drive); + if (speed) { + config_it821x_chipset_for_pio(drive, 0); + it821x_tune_chipset(drive, speed); + + return ide_dma_enable(drive); + } + + return 0; } /** -- cgit v0.10.2 From a75ad3c27a6ad78c4306cac939938050dcde54f3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 28 Jul 2006 09:04:09 +0200 Subject: [PATCH] scsi: kill overeager "not-ready" messages HAL and friends have a tendency to trigger this one all the time. It's not really interesting, so kill it. The vendor kernels all do anyways. 
Signed-off-by: Jens Axboe diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index a89c411..32293f4 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -110,11 +110,8 @@ static int ioctl_internal_command(struct scsi_device *sdev, char *cmd, sshdr.asc, sshdr.ascq); break; case NOT_READY: /* This happens if there is no disc in drive */ - if (sdev->removable && (cmd[0] != TEST_UNIT_READY)) { - printk(KERN_INFO "Device not ready. Make sure" - " there is a disc in the drive.\n"); + if (sdev->removable) break; - } case UNIT_ATTENTION: if (sdev->removable) { sdev->changed = 1; -- cgit v0.10.2 From 2a293b7d5aa2f0d1e3d87b642f7ac263c2d664e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Jul 2006 17:04:26 +1000 Subject: [XFS] All xfs_disk_dquot_t values are (as the name says) disk endian. Before putting them into struct statfs they should be endian-swapped. SGI-PV: 954580 SGI-Modid: xfs-linux-melb:xfs-kern:26550a Signed-off-by: Christoph Hellwig Signed-off-by: Nathan Scott diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index e95e99f..f137856 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -217,17 +217,24 @@ xfs_qm_statvfs( return 0; dp = &dqp->q_core; - limit = dp->d_blk_softlimit ? dp->d_blk_softlimit : dp->d_blk_hardlimit; + limit = dp->d_blk_softlimit ? + be64_to_cpu(dp->d_blk_softlimit) : + be64_to_cpu(dp->d_blk_hardlimit); if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; - statp->f_bfree = (statp->f_blocks > dp->d_bcount) ? - (statp->f_blocks - dp->d_bcount) : 0; + statp->f_bfree = + (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? + (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; } - limit = dp->d_ino_softlimit ? dp->d_ino_softlimit : dp->d_ino_hardlimit; + + limit = dp->d_ino_softlimit ? + be64_to_cpu(dp->d_ino_softlimit) : + be64_to_cpu(dp->d_ino_hardlimit); if (limit && statp->f_files > limit) { statp->f_files = limit; - statp->f_ffree = (statp->f_files > dp->d_icount) ? - (statp->f_ffree - dp->d_icount) : 0; + statp->f_ffree = + (statp->f_files > be64_to_cpu(dp->d_icount)) ? + (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; } xfs_qm_dqput(dqp); -- cgit v0.10.2 From f5faad799475c4058416264f672bb33bf8b5ef41 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:04:44 +1000 Subject: [XFS] Fix remount vs no/barrier options by ensuring we clear unwanted flags from iclog buffers before submitting them for writing. 
SGI-PV: 954772 SGI-Modid: xfs-linux-melb:xfs-kern:26605a Signed-off-by: Nathan Scott diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index ceda3a2..7858703 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -246,8 +246,8 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define BUF_BUSY XBF_DONT_BLOCK #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) -#define XFS_BUF_ZEROFLAGS(bp) \ - ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI)) +#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ + ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) #define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e730328..21ac1a6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1413,7 +1413,7 @@ xlog_sync(xlog_t *log, ops = iclog->ic_header.h_num_logops; INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops); - bp = iclog->ic_bp; + bp = iclog->ic_bp; ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT))); @@ -1430,15 +1430,14 @@ xlog_sync(xlog_t *log, } XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count); XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ + XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); /* * Do an ordered write for the log block. - * - * It may not be needed to flush the first split block in the log wrap - * case, but do it anyways to be safe -AK + * Its unnecessary to flush the first split block in the log wrap case. */ - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER)) XFS_BUF_ORDERED(bp); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); @@ -1460,7 +1459,7 @@ xlog_sync(xlog_t *log, return error; } if (split) { - bp = iclog->ic_log->l_xbuf; + bp = iclog->ic_log->l_xbuf; ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); @@ -1468,6 +1467,7 @@ xlog_sync(xlog_t *log, XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ (__psint_t)count), split); XFS_BUF_SET_FSPRIVATE(bp, iclog); + XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) -- cgit v0.10.2 From b2ea401bac39e75ebb64038609ed22efbc799905 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:05:13 +1000 Subject: [XFS] Fix a barrier related forced shutdown on mounts with quota enabled. 
SGI-PV: 912426 SGI-Modid: xfs-linux-melb:xfs-kern:26622a Signed-off-by: Nathan Scott diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9bdef9d..4754f34 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -314,6 +314,13 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) return; } + if (xfs_readonly_buftarg(mp->m_ddev_targp)) { + xfs_fs_cmn_err(CE_NOTE, mp, + "Disabling barriers, underlying device is readonly"); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + return; + } + error = xfs_barrier_test(mp); if (error) { xfs_fs_cmn_err(CE_NOTE, mp, diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 6c96391..b427d22 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -515,7 +515,7 @@ xfs_mount( if (error) goto error2; - if ((mp->m_flags & XFS_MOUNT_BARRIER) && !(vfsp->vfs_flag & VFS_RDONLY)) + if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_mountfs_check_barriers(mp); error = XFS_IOINIT(vfsp, args, flags); -- cgit v0.10.2 From 41ff715abc49324fb2cb20e66bc4e0290cfdbe51 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 28 Jul 2006 17:05:51 +1000 Subject: [XFS] Ensure bulkstat from an invalid inode number gets caught always with EINVAL. SGI-PV: 953819 SGI-Modid: xfs-linux-melb:xfs-kern:26629a Signed-off-by: Nathan Scott diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 86c1bf0..1f8ecff 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -334,10 +334,9 @@ xfs_itobp( #if !defined(__KERNEL__) ni = 0; #elif defined(DEBUG) - ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : - (BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog); + ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; #else /* usual case */ - ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 1; + ni = 1; #endif for (i = 0; i < ni; i++) { @@ -348,11 +347,15 @@ xfs_itobp( (i << mp->m_sb.sb_inodelog)); di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (imap_flags & XFS_IMAP_BULKSTAT) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EINVAL); + } #ifdef DEBUG - if (!(imap_flags & XFS_IMAP_BULKSTAT)) - cmn_err(CE_ALERT, + cmn_err(CE_ALERT, "Device %s - bad inode magic/vsn " "daddr %lld #%d (magic=%x)", XFS_BUFTARG_NAME(mp->m_ddev_targp), -- cgit v0.10.2 From 93853fd0d492524e9172297d8e8b8364dc2c4c59 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 28 Jul 2006 01:09:40 -0700 Subject: [SUNLANCE]: fix compilation on sparc-UP Signed-off-by: Alexey Dobriyan Signed-off-by: David S. 
Miller diff --git a/drivers/net/sunlance.c b/drivers/net/sunlance.c index 1ef9fd3..0e3fdf7 100644 --- a/drivers/net/sunlance.c +++ b/drivers/net/sunlance.c @@ -1537,7 +1537,7 @@ static int __init sparc_lance_init(void) { if ((idprom->id_machtype == (SM_SUN4|SM_4_330)) || (idprom->id_machtype == (SM_SUN4|SM_4_470))) { - memset(&sun4_sdev, 0, sizeof(sdev)); + memset(&sun4_sdev, 0, sizeof(struct sbus_dev)); sun4_sdev.reg_addrs[0].phys_addr = sun4_eth_physaddr; sun4_sdev.irqs[0] = 6; return sparc_lance_probe_one(&sun4_sdev, NULL, NULL); @@ -1547,16 +1547,16 @@ static int __exit sunlance_sun4_remove(void) { - struct lance_private *lp = dev_get_drvdata(&sun4_sdev->dev); + struct lance_private *lp = dev_get_drvdata(&sun4_sdev.ofdev.dev); struct net_device *net_dev = lp->dev; unregister_netdevice(net_dev); - lance_free_hwresources(root_lance_dev); + lance_free_hwresources(lp); free_netdev(net_dev); - dev_set_drvdata(&sun4_sdev->dev, NULL); + dev_set_drvdata(&sun4_sdev.ofdev.dev, NULL); return 0; } -- cgit v0.10.2 From facf014792093d95e308b5d6ce9bc86d3c90e5b1 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 25 Jul 2006 16:15:16 -0400 Subject: [PATCH] i386: switch_to(): misplaced parentheses Recent changes in i386 __switch_to() have a misplaced closing parenthesis causing an unlikely() to terminate early. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Acked-by: Steven Rostedt Signed-off-by: Linus Torvalds diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 923bb29..8657c73 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -690,8 +690,8 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas /* * Now maybe handle debug registers and/or IO bitmaps */ - if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP) + if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW) || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))) __switch_to_xtra(next_p, tss); disable_tsc(prev_p, next_p); -- cgit v0.10.2 From d5a2601734bcc740ee78dc4cb0c56b5687da7bd9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:42 +0200 Subject: [PATCH] i386/x86-64: Add user_mode checks to profile_pc for oprofile Fixes an obscure user space triggerable crash during oprofiling. Oprofile calls profile_pc from NMIs even when user_mode(regs) is not true and the program counter is inside the kernel lock section. This opens a race: when a user program jumps to a kernel lock address and an NMI happens before the illegal page fault exception is raised and the program has an unmapped esp or ebp then the kernel could oops. NMIs have a higher priority than exceptions so that could happen. Add user_mode checks to i386/x86-64 profile_pc to prevent that.
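For illustration, the i386 half of the fix can be read as a standalone guard: dereference the saved frame pointer only when the sampled registers came from kernel mode. The sketch below paraphrases the patched logic; the name profile_pc_sketch is hypothetical, and the fragment assumes the helpers already used in arch/i386/kernel/time.c (instruction_pointer(), user_mode_vm(), in_lock_functions()), so it is not a drop-in replacement for the upstream function.

	/* Sketch only: mirrors the patched i386 profile_pc() logic. */
	unsigned long profile_pc_sketch(struct pt_regs *regs)
	{
		unsigned long pc = instruction_pointer(regs);

		/*
		 * Only trust the saved frame pointer for kernel-mode samples.
		 * A user program can point its EIP at a kernel lock function
		 * while its EBP is unmapped, and dereferencing that in NMI
		 * context would oops.
		 */
		if (!user_mode_vm(regs) && in_lock_functions(pc))
			return *(unsigned long *)(regs->ebp + 4);

		return pc;
	}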
Cc: John Levon Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 8705c0f..edd00f6 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -135,7 +135,7 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - if (in_lock_functions(pc)) + if (!user_mode_vm(regs) && in_lock_functions(pc)) return *(unsigned long *)(regs->ebp + 4); return pc; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index b9ff759..e0341c6 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -193,7 +193,7 @@ unsigned long profile_pc(struct pt_regs *regs) is just accounted to the spinlock function. Better would be to write these functions in assembler again and check exactly. */ - if (in_lock_functions(pc)) { + if (!user_mode(regs) && in_lock_functions(pc)) { char *v = *(char **)regs->rsp; if ((v >= _stext && v <= _etext) || (v >= _sinittext && v <= _einittext) || -- cgit v0.10.2 From 0e92da4acb763272c6060f0b14adc2377b627d07 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:45 +0200 Subject: [PATCH] x86_64: Don't clobber r8-r11 in int 0x80 handler When int 0x80 is called from long mode, r8-r11 would leak out of the kernel (or rather they would be filled with some values from the kernel stack). I don't think it's a security issue because the values come from the fixed stack frame, which should nearly always be user registers from a previous interrupt. Still, better to fix it. Longer term the register save macros need to be cleaned up to avoid such mistakes in the future. Original analysis from Richard Brunner, fix by me. Cc: Richard.Brunner@amd.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 9b5bb41..5d4a7d1 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -103,7 +103,7 @@ ENTRY(ia32_sysenter_target) pushq %rax CFI_ADJUST_CFA_OFFSET 8 cld - SAVE_ARGS 0,0,1 + SAVE_ARGS 0,0,0 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ 1: movl (%rbp),%r9d -- cgit v0.10.2 From a4045dff782a8692637c24a0222120082c887caa Mon Sep 17 00:00:00 2001 From: bibo mao Date: Fri, 28 Jul 2006 14:44:48 +0200 Subject: [PATCH] x86_64: Enlarge debug stack for nested kprobes On the x86_64 platform, the INT1 and INT3 trap stack is an IST stack called DEBUG_STACK; when an INT1/INT3 trap happens, the hardware switches to DEBUG_STACK. The current DEBUG_STACK size is 4K, and on each int1/int3 trap the kernel decrements the current DEBUG_STACK IST value by 4K. But if an int3/int1 trap is nested, it will destroy another vector's IST stack. This patch sets the DEBUG_STACK size to 8K and allows two levels of nested int1/int3 traps. The kprobe DEBUG_STACK may be nested, because a kprobe handler may itself be probed by other kprobes. Thanks to jbeulich for pointing out an error in the first patch. [AK: nested kprobes are pretty dubious. Hopefully one nest will be enough.
This will cost 8K per CPU (4K more than before)] Signed-off-by: bibo, mao Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index f7bf875..10f3461 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -19,7 +19,7 @@ #define EXCEPTION_STACK_ORDER 0 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -#define DEBUG_STACK_ORDER EXCEPTION_STACK_ORDER +#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) #define IRQSTACK_ORDER 2 -- cgit v0.10.2 From b13761ecd1d9977d2083da243e051e9f29097aef Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:51 +0200 Subject: [PATCH] x86_64: Dump leftover backtrace entries when dwarf2 unwinder got stuck The dwarf2 unwinder currently often gets stuck because a lot of assembly code doesn't have proper dwarf2 annotation yet. This currently often happens with __down. Should fix this by adding proper dwarf2 annotation to all inline assembly. However until that's done we need a quick fix for 2.6.18 to avoid incomplete backtraces. So when this happens, dump the rest of the stack with the old unwinder instead of silently not dumping it. There was already an optional "both" mode that dumped both, but that was too ugly. I also clarified the headers for the different backtraces a bit. Also add a clear error message for missing dwarf2 annotation that people can work on. And I removed a dead variable left over from Ingo's changes. Cc: mingo@elte.hu Cc: jbeulich@novell.com Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index eb39a27..f7a9d14 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -254,7 +254,6 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; - int i = 11; unsigned used = 0; printk("\nCall Trace:\n"); @@ -275,11 +274,20 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if (unwind_init_blocked(&info, tsk) == 0) unw_ret = show_trace_unwind(&info, NULL); } - if (unw_ret > 0) { - if (call_trace > 0) + if (unw_ret > 0 && !arch_unw_user_mode(&info)) { +#ifdef CONFIG_STACK_UNWIND + unsigned long rip = info.regs.rip; + print_symbol("DWARF2 unwinder stuck at %s\n", rip); + if (call_trace == 1) { + printk("Leftover inexact backtrace:\n"); + stack = (unsigned long *)info.regs.rsp; + } else if (call_trace > 1) return; - printk("Legacy call trace:"); - i = 18; + else + printk("Full inexact backtrace again:\n"); +#else + printk("Inexact backtrace:\n"); +#endif } } @@ -1118,8 +1126,10 @@ static int __init call_trace_setup(char *s) call_trace = -1; else if (strcmp(s, "both") == 0) call_trace = 0; - else if (strcmp(s, "new") == 0) + else if (strcmp(s, "newfallback") == 0) call_trace = 1; + else if (strcmp(s, "new") == 0) + call_trace = 2; return 1; } __setup("call_trace=", call_trace_setup); -- cgit v0.10.2 From b783fd925cdd56d24d164e5bdcb072f2a67aedf4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:54 +0200 Subject: [PATCH] x86_64: Document backtracer selection options Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 6887d44..6da24e7 100644 --- a/Documentation/x86_64/boot-options.txt +++
b/Documentation/x86_64/boot-options.txt @@ -238,6 +238,13 @@ Debugging pagefaulttrace Dump all page faults. Only useful for extreme debugging and will create a lot of output. + call_trace=[old|both|newfallback|new] + old: use old inexact backtracer + new: use new exact dwarf2 unwinder + both: print entries from both + newfallback: use new unwinder but fall back to old if it gets + stuck (default) + Misc noreplacement Don't replace instructions with more appropriate ones -- cgit v0.10.2 From c97d20a6c51067a38f53680d9609b4cf2867d077 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 28 Jul 2006 14:44:57 +0200 Subject: [PATCH] i386: Do backtrace fallback too Similar patch to earlier x86-64 patch. When the dwarf2 unwinder fails dump the left over stack with the old unwinder. Also some clarifications in the headers. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 313ac1f..3facc8f 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -187,10 +187,21 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (unwind_init_blocked(&info, task) == 0) unw_ret = show_trace_unwind(&info, log_lvl); } - if (unw_ret > 0) { - if (call_trace > 0) + if (unw_ret > 0 && !arch_unw_user_mode(&info)) { +#ifdef CONFIG_STACK_UNWIND + print_symbol("DWARF2 unwinder stuck at %s\n", + UNW_PC(info.regs)); + if (call_trace == 1) { + printk("Leftover inexact backtrace:\n"); + if (UNW_SP(info.regs)) + stack = (void *)UNW_SP(info.regs); + } else if (call_trace > 1) return; - printk("%sLegacy call trace:\n", log_lvl); + else + printk("Full inexact backtrace again:\n"); +#else + printk("Inexact backtrace:\n"); +#endif } } -- cgit v0.10.2 From 627371d73cdd04ed23fe098755b4f855138ad9e0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Jul 2006 05:16:20 +0200 Subject: [PATCH] pi-futex: robust-futex exit crash fix Fix pi_state->list handling bugs: list handling mishap, locking error. Plus add more debug checks and fix a few style issues i noticed while debugging this. (reported by Ulrich Drepper and Jakub Jelinek.) 
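The locking error is easiest to see as a pattern: the hash bucket must be looked up while curr->pi_lock is still held, the locks must then be reacquired in hb->lock followed by pi_lock order, and the list head revalidated before the entry is detached. The fragment below is a condensed paraphrase of the patched exit_pi_state_list() loop shown in the diff that follows; the _sketch name is hypothetical, and the rt_mutex unlock and refcount handling are elided, so treat it as an illustration rather than a drop-in.

	/*
	 * Condensed paraphrase of the patched exit_pi_state_list()
	 * (see the diff below); error handling and cleanup elided.
	 */
	static void exit_pi_state_list_sketch(struct task_struct *curr)
	{
		struct list_head *next, *head = &curr->pi_state_list;
		struct futex_pi_state *pi_state;
		struct futex_hash_bucket *hb;
		union futex_key key;

		spin_lock_irq(&curr->pi_lock);
		while (!list_empty(head)) {
			next = head->next;
			pi_state = list_entry(next, struct futex_pi_state, list);
			key = pi_state->key;
			hb = hash_futex(&key);		/* bucket looked up under pi_lock */
			spin_unlock_irq(&curr->pi_lock);

			spin_lock(&hb->lock);		/* bucket lock first ...  */
			spin_lock_irq(&curr->pi_lock);	/* ... then pi_lock again */
			if (head->next != next) {	/* raced: entry already handled */
				spin_unlock(&hb->lock);
				continue;
			}

			WARN_ON(list_empty(&pi_state->list));
			list_del_init(&pi_state->list);	/* detach under both locks */
			pi_state->owner = NULL;
			spin_unlock_irq(&curr->pi_lock);

			/* ... rt_mutex_unlock(), drop the pi_state reference ... */
			spin_unlock(&hb->lock);

			spin_lock_irq(&curr->pi_lock);
		}
		spin_unlock_irq(&curr->pi_lock);
	}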
Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds diff --git a/kernel/futex.c b/kernel/futex.c index cf0c8e2..f59003b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -415,15 +415,15 @@ out_unlock: */ void exit_pi_state_list(struct task_struct *curr) { - struct futex_hash_bucket *hb; struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; union futex_key key; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselfs + * versus waiters unqueueing themselves: */ spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { @@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; + hb = hash_futex(&key); spin_unlock_irq(&curr->pi_lock); - hb = hash_futex(&key); spin_lock(&hb->lock); spin_lock_irq(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ if (head->next != next) { spin_unlock(&hb->lock); continue; } - list_del_init(&pi_state->list); - WARN_ON(pi_state->owner != curr); - + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); pi_state->owner = NULL; spin_unlock_irq(&curr->pi_lock); @@ -470,7 +473,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &me->key)) { + if (match_futex(&this->key, &me->key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: @@ -482,6 +485,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) if (unlikely(!pi_state)) return -EINVAL; + WARN_ON(!atomic_read(&pi_state->refcount)); + atomic_inc(&pi_state->refcount); me->pi_state = pi_state; @@ -510,6 +515,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) pi_state->key = me->key; spin_lock_irq(&p->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; spin_unlock_irq(&p->pi_lock); @@ -584,9 +590,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (curval != uval) return -EINVAL; - list_del_init(&pi_state->owner->pi_state_list); + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + spin_lock_irq(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; + spin_unlock_irq(&new_owner->pi_lock); + rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -1236,6 +1250,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, /* Owner died? 
*/ if (q.pi_state->owner != NULL) { spin_lock_irq(&q.pi_state->owner->pi_lock); + WARN_ON(list_empty(&q.pi_state->list)); list_del_init(&q.pi_state->list); spin_unlock_irq(&q.pi_state->owner->pi_lock); } else @@ -1244,6 +1259,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, q.pi_state->owner = current; spin_lock_irq(¤t->pi_lock); + WARN_ON(!list_empty(&q.pi_state->list)); list_add(&q.pi_state->list, ¤t->pi_state_list); spin_unlock_irq(¤t->pi_lock); -- cgit v0.10.2 From e3f2ddeac718c768fdac4b7fe69d465172f788a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Jul 2006 05:17:57 +0200 Subject: [PATCH] pi-futex: robust-futex exit Fix robust PI-futexes to be properly unlocked on unexpected exit. For this to work the kernel has to know whether a futex is a PI or a non-PI one, because the semantics are different. Since the space in relevant glibc data structures is extremely scarce, the best solution is to encode the 'PI' information in bit 0 of the robust list pointer. Existing (non-PI) glibc robust futexes have this bit always zero, so the ABI is kept. New glibc with PI-robust-futexes will set this bit. Further fixes from Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Ulrich Drepper Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds diff --git a/include/linux/futex.h b/include/linux/futex.h index 34c3a21..d097b5b 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -96,7 +96,8 @@ struct robust_list_head { long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, u32 __user *uaddr2, u32 val2, u32 val3); -extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr); +extern int +handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); diff --git a/kernel/futex.c b/kernel/futex.c index f59003b..dda2049 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -495,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) } /* - * We are the first waiter - try to look up the real owner and - * attach the new pi_state to it: + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when the owner died bit is set + * and TID = 0: */ pid = uval & FUTEX_TID_MASK; + if (!pid && (uval & FUTEX_OWNER_DIED)) + return -ESRCH; p = futex_find_get_task(pid); if (!p) return -ESRCH; @@ -579,16 +582,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) * kept enabled while there is PI state around. We must also * preserve the owner died bit.) */ - newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; - - inc_preempt_count(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + newval = FUTEX_WAITERS | new_owner->pid; - if (curval == -EFAULT) - return -EFAULT; - if (curval != uval) - return -EINVAL; + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + } spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); @@ -1443,9 +1447,11 @@ retry_locked: * again. 
If it succeeds then we can return without waking * anyone else up: */ - inc_preempt_count(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + } if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1478,9 +1484,11 @@ retry_locked: /* * No waiters - kernel unlocks the futex: */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; + if (!(uval & FUTEX_OWNER_DIED)) { + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + } out_unlock: spin_unlock(&hb->lock); @@ -1699,9 +1707,9 @@ err_unlock: * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval; + u32 uval, nval, mval; retry: if (get_user(uval, uaddr)) @@ -1718,21 +1726,45 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. */ - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED); + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); + if (nval == -EFAULT) return -1; if (nval != uval) goto retry; - if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi) { + if (uval & FUTEX_WAITERS) + futex_wake(uaddr, 1); + } } return 0; } /* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user **head, int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long *)head)) + return -EFAULT; + + *entry = (void *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + +/* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. 
* @@ -1742,14 +1774,14 @@ void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned long futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(entry, &head->list.next)) + if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: @@ -1760,10 +1792,11 @@ void exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(pending, &head->list_op_pending)) + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; + if (pending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pip); while (entry != &head->list) { /* @@ -1772,12 +1805,12 @@ void exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(entry, &entry->next)) + if (fetch_robust_entry(&entry, &entry->next, &pi)) return; /* * Avoid excessively long or circular lists: diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d1d92b4..d1aab1a 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -12,6 +12,23 @@ #include + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t *head, int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. 
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi; compat_uptr_t uentry, upending; - unsigned int limit = ROBUST_LIST_LIMIT; compat_long_t futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(uentry, &head->list.next)) + if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; - entry = compat_ptr(uentry); /* * Fetch the relative futex offset: */ @@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(upending, &head->list_op_pending)) + if (fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pi)) return; - pending = compat_ptr(upending); if (upending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pi); while (compat_ptr(uentry) != &head->list) { /* @@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(uentry, (compat_uptr_t *)&entry->next)) + if (fetch_robust_entry(&uentry, &entry, + (compat_uptr_t *)&entry->next, &pi)) return; - entry = compat_ptr(uentry); /* * Avoid excessively long or circular lists: */ -- cgit v0.10.2
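Taken together, the robust-list changes rest on one encoding trick: bit 0 of each user-space robust-list pointer carries the PI flag, and the kernel masks it off before following the pointer, as fetch_robust_entry() above does. The small user-space C sketch below shows that encoding in isolation; the address and variable names are purely illustrative and not part of the glibc ABI.

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* Robust-list entries are at least word aligned, so bit 0 is free. */
		uintptr_t entry_addr = 0x1008;			/* hypothetical list entry address */
		uintptr_t raw = entry_addr | 1;			/* PI futex: tag with bit 0 set    */

		void *entry = (void *)(raw & ~(uintptr_t)1);	/* pointer the kernel follows      */
		int   is_pi = (int)(raw & 1);			/* 1 = PI futex, 0 = plain robust  */

		printf("entry=%p pi=%d\n", entry, is_pi);	/* prints: entry=0x1008 pi=1       */
		return 0;
	}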