From 43dff98b022ded593e73c3784bac03bc9fc7ec55 Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Fri, 28 Apr 2006 12:00:17 -0700 Subject: [X25]: fix for spinlock recurse and spinlock lockup with timer handler When the sk_timer function x25_heartbeat_expiry() is called by the kernel in a running/terminating process, spinlock-recursion and spinlock-lockup locks up the kernel. This has happened with testing on some distro's and the patch below fixed it. Signed-off-by: Shaun Pereira Signed-off-by: David S. Miller diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index 0a92e1d..71ff308 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -114,8 +114,9 @@ static void x25_heartbeat_expiry(unsigned long param) if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { + bh_unlock_sock(sk); x25_destroy_socket(sk); - goto unlock; + return; } break; @@ -128,7 +129,6 @@ static void x25_heartbeat_expiry(unsigned long param) } restart_heartbeat: x25_start_heartbeat(sk); -unlock: bh_unlock_sock(sk); } -- cgit v0.10.2 From 89bbb0a361cdae50eec863f10a876b58abf7d312 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 28 Apr 2006 12:11:36 -0700 Subject: [PKT_SCHED] netem: fix loss The following one line fix is needed to make loss function of netem work right when doing loss on the local host. Otherwise, higher layers just recover. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 7228d30..5a4a4d0 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -167,7 +167,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (count == 0) { sch->qstats.drops++; kfree_skb(skb); - return NET_XMIT_DROP; + return NET_XMIT_BYPASS; } /* -- cgit v0.10.2 From 09493abfdbe8144ce0805efac7a8a3c4eb869c12 Mon Sep 17 00:00:00 2001 From: Soyoung Park Date: Fri, 28 Apr 2006 14:59:44 -0700 Subject: [NETLINK]: cleanup unused macro in net/netlink/af_netlink.c 1 line removal, of unused macro. ran 'egrep -r' from linux-2.6.16/ for Nprintk and didn't see it anywhere else but here, in #define... Signed-off-by: Soyoung Park Signed-off-by: David S. Miller diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2a233ff..b8ea61f 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -61,7 +61,6 @@ #include #include -#define Nprintk(a...) #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) struct netlink_sock { -- cgit v0.10.2 From a536e0778781c1f2ce38cf8e1d82ce258f0193c1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 28 Apr 2006 15:19:17 -0700 Subject: [IPV4]: inet_init() -> fs_initcall Convert inet_init to an fs_initcall to make sure its called before any device driver's initcall. Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: David S. Miller diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index dc206f1..0a27745 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1257,7 +1257,7 @@ out_unregister_udp_proto: goto out; } -module_init(inet_init); +fs_initcall(inet_init); /* ------------------------------------------------------------------------ */ -- cgit v0.10.2 From da753beaeb1446aa87bcca7e8a0026633a8914f0 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 28 Apr 2006 15:21:23 -0700 Subject: [NET]: use hlist_unhashed() Use hlist_unhashed() rather than accessing inside data structure. Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: David S. Miller diff --git a/include/linux/list.h b/include/linux/list.h index 67258b4..76f0571 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -619,7 +619,7 @@ static inline void hlist_del_rcu(struct hlist_node *n) static inline void hlist_del_init(struct hlist_node *n) { - if (n->pprev) { + if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 1da294c..e837f98 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -150,7 +150,7 @@ static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw) { - return tw->tw_death_node.pprev != NULL; + return !hlist_unhashed(&tw->tw_death_node); } static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw) diff --git a/include/net/sock.h b/include/net/sock.h index ff8b0da..c9fad6f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -279,7 +279,7 @@ static inline int sk_unhashed(const struct sock *sk) static inline int sk_hashed(const struct sock *sk) { - return sk->sk_node.pprev != NULL; + return !sk_unhashed(sk); } static __inline__ void sk_node_init(struct hlist_node *node) -- cgit v0.10.2 From a76e07acd0de635122c5e60ccd5e55f9d5082391 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 28 Apr 2006 15:22:13 -0700 Subject: [IPSEC]: Fix IP ID selection I was looking through the xfrm input/output code in order to abstract out the address family specific encapsulation/decapsulation code. During that process I found this bug in the IP ID selection code in xfrm4_output.c. At that point dst is still the xfrm_dst for the current SA which represents an internal flow as far as the IPsec tunnel is concerned. Since the IP ID is going to sit on the outside of the encapsulated packet, we obviously want the external flow which is just dst->child. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 32ad229..4ef8efa 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -62,7 +62,7 @@ static void xfrm4_encap(struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (iph->frag_off & htons(IP_DF)); if (!top_iph->frag_off) - __ip_select_ident(top_iph, dst, 0); + __ip_select_ident(top_iph, dst->child, 0); top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); -- cgit v0.10.2 From 8dff7c29707b7514043539f5ab5e0a6eb7bd9dcd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Apr 2006 15:23:59 -0700 Subject: [XFRM]: fix softirq-unsafe xfrm typemap->lock use xfrm typemap->lock may be used in softirq context, so all write_lock() uses must be softirq-safe. Signed-off-by: Ingo Molnar Signed-off-by: David S. Miller diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c3725fe..e5b0afc 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -57,12 +57,12 @@ int xfrm_register_type(struct xfrm_type *type, unsigned short family) return -EAFNOSUPPORT; typemap = afinfo->type_map; - write_lock(&typemap->lock); + write_lock_bh(&typemap->lock); if (likely(typemap->map[type->proto] == NULL)) typemap->map[type->proto] = type; else err = -EEXIST; - write_unlock(&typemap->lock); + write_unlock_bh(&typemap->lock); xfrm_policy_put_afinfo(afinfo); return err; } @@ -78,12 +78,12 @@ int xfrm_unregister_type(struct xfrm_type *type, unsigned short family) return -EAFNOSUPPORT; typemap = afinfo->type_map; - write_lock(&typemap->lock); + write_lock_bh(&typemap->lock); if (unlikely(typemap->map[type->proto] != type)) err = -ENOENT; else typemap->map[type->proto] = NULL; - write_unlock(&typemap->lock); + write_unlock_bh(&typemap->lock); xfrm_policy_put_afinfo(afinfo); return err; } -- cgit v0.10.2 From 83de47cd0c5738105f40e65191b0761dfa7431ac Mon Sep 17 00:00:00 2001 From: Hua Zhong Date: Fri, 28 Apr 2006 15:26:50 -0700 Subject: [TCP]: Fix unlikely usage in tcp_transmit_skb() The following unlikely should be replaced by likely because the condition happens every time unless there is a hard error to transmit a packet. Signed-off-by: Hua Zhong Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a28ae59..743016b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -465,7 +465,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_INC_STATS(TCP_MIB_OUTSEGS); err = icsk->icsk_af_ops->queue_xmit(skb, 0); - if (unlikely(err <= 0)) + if (likely(err <= 0)) return err; tcp_enter_cwr(sk); -- cgit v0.10.2 From f3111502c065d32dcb021f55e30398aaebd8fb0f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Apr 2006 15:30:03 -0700 Subject: [XFRM]: fix incorrect xfrm_state_afinfo_lock use xfrm_state_afinfo_lock can be read-locked from bh context, so take it in a bh-safe manner in xfrm_state_register_afinfo() and xfrm_state_unregister_afinfo(). Found by the lock validator. Signed-off-by: Ingo Molnar Signed-off-by: David S. Miller diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 3dc3e1f..93a2f36 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1061,7 +1061,7 @@ int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) return -EINVAL; if (unlikely(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; - write_lock(&xfrm_state_afinfo_lock); + write_lock_bh(&xfrm_state_afinfo_lock); if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) err = -ENOBUFS; else { @@ -1069,7 +1069,7 @@ int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) afinfo->state_byspi = xfrm_state_byspi; xfrm_state_afinfo[afinfo->family] = afinfo; } - write_unlock(&xfrm_state_afinfo_lock); + write_unlock_bh(&xfrm_state_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_state_register_afinfo); @@ -1081,7 +1081,7 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) return -EINVAL; if (unlikely(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; - write_lock(&xfrm_state_afinfo_lock); + write_lock_bh(&xfrm_state_afinfo_lock); if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo)) err = -EINVAL; @@ -1091,7 +1091,7 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) afinfo->state_bydst = NULL; } } - write_unlock(&xfrm_state_afinfo_lock); + write_unlock_bh(&xfrm_state_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_state_unregister_afinfo); -- cgit v0.10.2 From e959d8121fcbfee6ec049cc617e9423d1799f2e4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Apr 2006 15:32:29 -0700 Subject: [XFRM]: fix incorrect xfrm_policy_afinfo_lock use xfrm_policy_afinfo_lock can be taken in bh context, at: [] lockdep_acquire_read+0x54/0x6d [] _read_lock+0x15/0x22 [] xfrm_policy_get_afinfo+0x1a/0x3d [] xfrm_decode_session+0x12/0x32 [] ip_route_me_harder+0x1c9/0x25b [] ip_nat_local_fn+0x94/0xad [] nf_iterate+0x2e/0x7a [] nf_hook_slow+0x3c/0x9e [] ip_push_pending_frames+0x2de/0x3a7 [] icmp_push_reply+0x136/0x141 [] icmp_reply+0x118/0x1a0 [] icmp_echo+0x44/0x46 [] icmp_rcv+0x111/0x138 [] ip_local_deliver+0x150/0x1f9 [] ip_rcv+0x3d5/0x413 [] netif_receive_skb+0x337/0x356 [] process_backlog+0x95/0x110 [] net_rx_action+0xa5/0x16d [] __do_softirq+0x6f/0xe6 [] do_softirq+0x52/0xb1 this means that all write-locking of xfrm_policy_afinfo_lock must be bh-safe. This patch fixes xfrm_policy_register_afinfo() and xfrm_policy_unregister_afinfo(). Signed-off-by: Ingo Molnar Signed-off-by: David S. Miller diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e5b0afc..b469c8b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1251,7 +1251,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) return -EINVAL; if (unlikely(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; - write_lock(&xfrm_policy_afinfo_lock); + write_lock_bh(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) err = -ENOBUFS; else { @@ -1268,7 +1268,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) afinfo->garbage_collect = __xfrm_garbage_collect; xfrm_policy_afinfo[afinfo->family] = afinfo; } - write_unlock(&xfrm_policy_afinfo_lock); + write_unlock_bh(&xfrm_policy_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); @@ -1280,7 +1280,7 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) return -EINVAL; if (unlikely(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; - write_lock(&xfrm_policy_afinfo_lock); + write_lock_bh(&xfrm_policy_afinfo_lock); if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) { if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo)) err = -EINVAL; @@ -1294,7 +1294,7 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) afinfo->garbage_collect = NULL; } } - write_unlock(&xfrm_policy_afinfo_lock); + write_unlock_bh(&xfrm_policy_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); -- cgit v0.10.2 From c302e6d54e468ee9b1e18152e2e9da06953f3473 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Fri, 28 Apr 2006 15:59:15 -0700 Subject: [IPV6]: Fix race in route selection. We eliminated rt6_dflt_lock (to protect default router pointer) at 2.6.17-rc1, and introduced rt6_select() for general router selection. The function is called in the context of rt6_lock read-lock held, but this means, we have some race conditions when we do round-robin. Signed-off-by; YOSHIFUJI Hideaki Signed-off-by: David S. Miller diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7907874..0190e39 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -317,7 +317,7 @@ static struct rt6_info *rt6_select(struct rt6_info **head, int oif, __FUNCTION__, head, head ? *head : NULL, oif); for (rt = rt0, metric = rt0->rt6i_metric; - rt && rt->rt6i_metric == metric; + rt && rt->rt6i_metric == metric && (!last || rt != rt0); rt = rt->u.next) { int m; @@ -343,9 +343,12 @@ static struct rt6_info *rt6_select(struct rt6_info **head, int oif, (strict & RT6_SELECT_F_REACHABLE) && last && last != rt0) { /* no entries matched; do round-robin */ + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + spin_lock(&lock); *head = rt0->u.next; rt0->u.next = last->u.next; last->u.next = rt0; + spin_unlock(&lock); } RT6_TRACE("%s() => %p, score=%d\n", -- cgit v0.10.2 From c8e1e82b6a97ad44517517aa58b7b794ead0bf33 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 29 Apr 2006 18:55:17 -0700 Subject: [TG3]: Call netif_carrier_off() during phy reset Add netif_carrier_off() call during tg3_phy_reset(). This is needed to properly track the netif_carrier state in cases where we do a PHY reset with interrupts disabled. The SerDes code will not run properly if the netif_carrier state is wrong. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 73e271e..a28accb 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -974,6 +974,8 @@ static int tg3_phy_reset_5703_4_5(struct tg3 *tp) return err; } +static void tg3_link_report(struct tg3 *); + /* This will reset the tigon3 PHY if there is no valid * link unless the FORCE argument is non-zero. */ @@ -987,6 +989,11 @@ static int tg3_phy_reset(struct tg3 *tp) if (err != 0) return -EBUSY; + if (netif_running(tp->dev) && netif_carrier_ok(tp->dev)) { + netif_carrier_off(tp->dev); + tg3_link_report(tp); + } + if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5703 || GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5704 || GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5705) { -- cgit v0.10.2 From c424cb249dae10fb7f118f89091f1329b62b92f4 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 29 Apr 2006 18:56:34 -0700 Subject: [TG3]: Add phy workaround Add some PHY workaround code to reduce jitter on some PHYs. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index a28accb..a307340 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -1030,6 +1030,12 @@ out: tg3_writephy(tp, MII_TG3_DSP_RW_PORT, 0x14e2); tg3_writephy(tp, MII_TG3_AUX_CTRL, 0x0400); } + else if (tp->tg3_flags2 & TG3_FLG2_PHY_JITTER_BUG) { + tg3_writephy(tp, MII_TG3_AUX_CTRL, 0x0c00); + tg3_writephy(tp, MII_TG3_DSP_ADDRESS, 0x000a); + tg3_writephy(tp, MII_TG3_DSP_RW_PORT, 0x010b); + tg3_writephy(tp, MII_TG3_AUX_CTRL, 0x0400); + } /* Set Extended packet length bit (bit 14) on all chips that */ /* support jumbo frames */ if ((tp->phy_id & PHY_ID_MASK) == PHY_ID_BCM5401) { @@ -10360,10 +10366,13 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) if (tp->pci_chip_rev_id == CHIPREV_ID_5704_A0) tp->tg3_flags2 |= TG3_FLG2_PHY_5704_A0_BUG; - if ((tp->tg3_flags2 & TG3_FLG2_5705_PLUS) && - (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5755) && - (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5787)) - tp->tg3_flags2 |= TG3_FLG2_PHY_BER_BUG; + if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS) { + if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 || + GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787) + tp->tg3_flags2 |= TG3_FLG2_PHY_JITTER_BUG; + else + tp->tg3_flags2 |= TG3_FLG2_PHY_BER_BUG; + } tp->coalesce_mode = 0; if (GET_CHIP_REV(tp->pci_chip_rev_id) != CHIPREV_5700_AX && diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 8c8b987..0e29b88 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2215,6 +2215,7 @@ struct tg3 { #define TG3_FLG2_HW_TSO_2 0x08000000 #define TG3_FLG2_HW_TSO (TG3_FLG2_HW_TSO_1 | TG3_FLG2_HW_TSO_2) #define TG3_FLG2_1SHOT_MSI 0x10000000 +#define TG3_FLG2_PHY_JITTER_BUG 0x20000000 u32 split_mode_max_reqs; #define SPLIT_MODE_5704_MAX_REQ 3 -- cgit v0.10.2 From 58712ef9f2cbaaeac5b32ac11810a4bbd0eeacc5 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 29 Apr 2006 18:58:01 -0700 Subject: [TG3]: Reset chip when changing MAC address Do the full chip reset when changing MAC address if ASF is enabled. ASF sometimes uses a different MAC address than the driver. Without the reset, the ASF MAC address may be overwritten when the driver's MAC address is changed. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index a307340..0ccfb63 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -5732,9 +5732,23 @@ static int tg3_set_mac_addr(struct net_device *dev, void *p) if (!netif_running(dev)) return 0; - spin_lock_bh(&tp->lock); - __tg3_set_mac_addr(tp); - spin_unlock_bh(&tp->lock); + if (tp->tg3_flags & TG3_FLAG_ENABLE_ASF) { + /* Reset chip so that ASF can re-init any MAC addresses it + * needs. + */ + tg3_netif_stop(tp); + tg3_full_lock(tp, 1); + + tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); + tg3_init_hw(tp); + + tg3_netif_start(tp); + tg3_full_unlock(tp); + } else { + spin_lock_bh(&tp->lock); + __tg3_set_mac_addr(tp); + spin_unlock_bh(&tp->lock); + } return 0; } -- cgit v0.10.2 From 8e7a22e3eb49042c048f24bab40cf5cf8915487d Mon Sep 17 00:00:00 2001 From: Gary Zambrano Date: Sat, 29 Apr 2006 18:59:13 -0700 Subject: [TG3]: Add reset_phy parameter to chip reset functions Add a reset_phy parameter to tg3_reset_hw() and tg3_init_hw(). With the full chip reset during MAC address change, the automatic PHY reset during chip reset will cause a link down and bonding will not work properly as a result. With this reset_phy parameter, we can do a chip reset without link down when changing MAC address or MTU. Signed-off-by: Gary Zambrano Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 0ccfb63..97e27d8 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -3544,7 +3544,7 @@ static irqreturn_t tg3_test_isr(int irq, void *dev_id, return IRQ_RETVAL(0); } -static int tg3_init_hw(struct tg3 *); +static int tg3_init_hw(struct tg3 *, int); static int tg3_halt(struct tg3 *, int, int); #ifdef CONFIG_NET_POLL_CONTROLLER @@ -3580,7 +3580,7 @@ static void tg3_reset_task(void *_data) tp->tg3_flags2 &= ~TG3_FLG2_RESTART_TIMER; tg3_halt(tp, RESET_KIND_SHUTDOWN, 0); - tg3_init_hw(tp); + tg3_init_hw(tp, 1); tg3_netif_start(tp); @@ -4055,7 +4055,7 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu) tg3_set_mtu(dev, tp, new_mtu); - tg3_init_hw(tp); + tg3_init_hw(tp, 0); tg3_netif_start(tp); @@ -5740,7 +5740,7 @@ static int tg3_set_mac_addr(struct net_device *dev, void *p) tg3_full_lock(tp, 1); tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); - tg3_init_hw(tp); + tg3_init_hw(tp, 0); tg3_netif_start(tp); tg3_full_unlock(tp); @@ -5798,7 +5798,7 @@ static void __tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec) } /* tp->lock is held. */ -static int tg3_reset_hw(struct tg3 *tp) +static int tg3_reset_hw(struct tg3 *tp, int reset_phy) { u32 val, rdmac_mode; int i, err, limit; @@ -5813,7 +5813,7 @@ static int tg3_reset_hw(struct tg3 *tp) tg3_abort_hw(tp, 1); } - if (tp->tg3_flags2 & TG3_FLG2_MII_SERDES) + if ((tp->tg3_flags2 & TG3_FLG2_MII_SERDES) && reset_phy) tg3_phy_reset(tp); err = tg3_chip_reset(tp); @@ -6354,7 +6354,7 @@ static int tg3_reset_hw(struct tg3 *tp) tw32(GRC_LOCAL_CTRL, tp->grc_local_ctrl); } - err = tg3_setup_phy(tp, 1); + err = tg3_setup_phy(tp, reset_phy); if (err) return err; @@ -6427,7 +6427,7 @@ static int tg3_reset_hw(struct tg3 *tp) /* Called at device open time to get the chip ready for * packet processing. Invoked with tp->lock held. */ -static int tg3_init_hw(struct tg3 *tp) +static int tg3_init_hw(struct tg3 *tp, int reset_phy) { int err; @@ -6440,7 +6440,7 @@ static int tg3_init_hw(struct tg3 *tp) tw32(TG3PCI_MEM_WIN_BASE_ADDR, 0); - err = tg3_reset_hw(tp); + err = tg3_reset_hw(tp, reset_phy); out: return err; @@ -6710,7 +6710,7 @@ static int tg3_test_msi(struct tg3 *tp) tg3_full_lock(tp, 1); tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); - err = tg3_init_hw(tp); + err = tg3_init_hw(tp, 1); tg3_full_unlock(tp); @@ -6775,7 +6775,7 @@ static int tg3_open(struct net_device *dev) tg3_full_lock(tp, 0); - err = tg3_init_hw(tp); + err = tg3_init_hw(tp, 1); if (err) { tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); tg3_free_rings(tp); @@ -7866,7 +7866,7 @@ static int tg3_set_ringparam(struct net_device *dev, struct ethtool_ringparam *e if (netif_running(dev)) { tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); - tg3_init_hw(tp); + tg3_init_hw(tp, 1); tg3_netif_start(tp); } @@ -7911,7 +7911,7 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam if (netif_running(dev)) { tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); - tg3_init_hw(tp); + tg3_init_hw(tp, 1); tg3_netif_start(tp); } @@ -8549,7 +8549,7 @@ static int tg3_test_loopback(struct tg3 *tp) if (!netif_running(tp->dev)) return TG3_LOOPBACK_FAILED; - tg3_reset_hw(tp); + tg3_reset_hw(tp, 1); if (tg3_run_loopback(tp, TG3_MAC_LOOPBACK)) err |= TG3_MAC_LOOPBACK_FAILED; @@ -8623,7 +8623,7 @@ static void tg3_self_test(struct net_device *dev, struct ethtool_test *etest, tg3_halt(tp, RESET_KIND_SHUTDOWN, 1); if (netif_running(dev)) { tp->tg3_flags |= TG3_FLAG_INIT_COMPLETE; - tg3_init_hw(tp); + tg3_init_hw(tp, 1); tg3_netif_start(tp); } @@ -11599,7 +11599,7 @@ static int tg3_suspend(struct pci_dev *pdev, pm_message_t state) tg3_full_lock(tp, 0); tp->tg3_flags |= TG3_FLAG_INIT_COMPLETE; - tg3_init_hw(tp); + tg3_init_hw(tp, 1); tp->timer.expires = jiffies + tp->timer_offset; add_timer(&tp->timer); @@ -11633,7 +11633,7 @@ static int tg3_resume(struct pci_dev *pdev) tg3_full_lock(tp, 0); tp->tg3_flags |= TG3_FLAG_INIT_COMPLETE; - tg3_init_hw(tp); + tg3_init_hw(tp, 1); tp->timer.expires = jiffies + tp->timer_offset; add_timer(&tp->timer); -- cgit v0.10.2 From f6d9a2565bc754043f43b8f51b19f77ee0269411 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 29 Apr 2006 19:00:24 -0700 Subject: [TG3]: Fix bug in nvram write Fix bug in nvram write function. If the starting nvram address offset happens to be the last dword of the page, the NVRAM_CMD_LAST bit will not get set in the existing code. This patch fixes the bug by changing the "else if" to "if" so that the last dword condition always gets checked. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 97e27d8..0779544 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -9404,7 +9404,7 @@ static int tg3_nvram_write_block_buffered(struct tg3 *tp, u32 offset, u32 len, if ((page_off == 0) || (i == 0)) nvram_cmd |= NVRAM_CMD_FIRST; - else if (page_off == (tp->nvram_pagesize - 4)) + if (page_off == (tp->nvram_pagesize - 4)) nvram_cmd |= NVRAM_CMD_LAST; if (i == (len - 4)) -- cgit v0.10.2 From b276764091cf241cf0b31e8cb76c67dcf9a9c1d8 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 29 Apr 2006 19:01:06 -0700 Subject: [TG3]: Update version and reldate Update version to 3.57. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 0779544..beeb612 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -69,8 +69,8 @@ #define DRV_MODULE_NAME "tg3" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "3.56" -#define DRV_MODULE_RELDATE "Apr 1, 2006" +#define DRV_MODULE_VERSION "3.57" +#define DRV_MODULE_RELDATE "Apr 28, 2006" #define TG3_DEF_MAC_MODE 0 #define TG3_DEF_RX_MODE 0 -- cgit v0.10.2 From cd95842ca0ffb0e3df3b459832a60f9f4544ed9e Mon Sep 17 00:00:00 2001 From: Markus Gutschke Date: Sun, 30 Apr 2006 15:34:29 +0100 Subject: [ARM] 3486/1: Mark memory as clobbered by the ARM _syscallX() macros Patch from Markus Gutschke In order to prevent gcc from making incorrect optimizations, all asm() statements that define system calls should report memory as clobbered. Recent versions of the headers for i386 have been changed accordingly, but the ARM headers are still defective. This patch fixes the bug tracked at http://bugzilla.kernel.org/show_bug.cgi?id=6205 Signed-off-by: Markus Gutschke Signed-off-by: Russell King diff --git a/drivers/input/touchscreen/corgi_ts.c b/drivers/input/touchscreen/corgi_ts.c index 1042987..5013703 100644 --- a/drivers/input/touchscreen/corgi_ts.c +++ b/drivers/input/touchscreen/corgi_ts.c @@ -17,7 +17,7 @@ #include #include #include -#include +//#include #include #include diff --git a/include/asm-arm/unistd.h b/include/asm-arm/unistd.h index ee8dfea..26f2f48 100644 --- a/include/asm-arm/unistd.h +++ b/include/asm-arm/unistd.h @@ -410,7 +410,8 @@ type name(void) { \ __asm__ __volatile__ ( \ __syscall(name) \ : "=r" (__res_r0) \ - : __SYS_REG_LIST() ); \ + : __SYS_REG_LIST() \ + : "memory" ); \ __res = __res_r0; \ __syscall_return(type,__res); \ } @@ -424,7 +425,8 @@ type name(type1 arg1) { \ __asm__ __volatile__ ( \ __syscall(name) \ : "=r" (__res_r0) \ - : __SYS_REG_LIST( "0" (__r0) ) ); \ + : __SYS_REG_LIST( "0" (__r0) ) \ + : "memory" ); \ __res = __res_r0; \ __syscall_return(type,__res); \ } @@ -439,7 +441,8 @@ type name(type1 arg1,type2 arg2) { \ __asm__ __volatile__ ( \ __syscall(name) \ : "=r" (__res_r0) \ - : __SYS_REG_LIST( "0" (__r0), "r" (__r1) ) ); \ + : __SYS_REG_LIST( "0" (__r0), "r" (__r1) ) \ + : "memory" ); \ __res = __res_r0; \ __syscall_return(type,__res); \ } @@ -456,7 +459,8 @@ type name(type1 arg1,type2 arg2,type3 arg3) { \ __asm__ __volatile__ ( \ __syscall(name) \ : "=r" (__res_r0) \ - : __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2) ) ); \ + : __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2) ) \ + : "memory" ); \ __res = __res_r0; \ __syscall_return(type,__res); \ } @@ -474,7 +478,8 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ __asm__ __volatile__ ( \ __syscall(name) \ : "=r" (__res_r0) \ - : __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2), "r" (__r3) ) ); \ + : __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2), "r" (__r3) ) \ + : "memory" ); \ __res = __res_r0; \ __syscall_return(type,__res); \ } @@ -494,7 +499,8 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) { \ __syscall(name) \ : "=r" (__res_r0) \ : __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2), \ - "r" (__r3), "r" (__r4) ) ); \ + "r" (__r3), "r" (__r4) ) \ + : "memory" ); \ __res = __res_r0; \ __syscall_return(type,__res); \ } @@ -514,7 +520,8 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6 __syscall(name) \ : "=r" (__res_r0) \ : __SYS_REG_LIST( "0" (__r0), "r" (__r1), "r" (__r2), \ - "r" (__r3), "r" (__r4), "r" (__r5) ) ); \ + "r" (__r3), "r" (__r4), "r" (__r5) ) \ + : "memory" ); \ __res = __res_r0; \ __syscall_return(type,__res); \ } -- cgit v0.10.2 From 76bbb00288e569e7bd9ec18f45e4f814352260dd Mon Sep 17 00:00:00 2001 From: Deepak Saxena Date: Sun, 30 Apr 2006 15:34:29 +0100 Subject: [ARM] 3487/1: IXP4xx: Support non-PCI systems Patch from Deepak Saxena This patch allows for the addition of IXP4xx systems that do not make use of the PCI interface by moving the CONFIG_PCI symbol selection to be platform-specific instead of for all of IXP4xx. If at least one machine with PCI support is built, the PCI code will be compiled in, but when building !PCI, this will drastically shrink the kernel size. Signed-off-by: Deepak Saxena Signed-off-by: Russell King diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 1dbf6dd..08b7cc9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -150,8 +150,6 @@ config ARCH_IOP3XX config ARCH_IXP4XX bool "IXP4xx-based" - select DMABOUNCE - select PCI help Support for Intel's IXP4XX (XScale) family of processors. diff --git a/arch/arm/mach-ixp4xx/Kconfig b/arch/arm/mach-ixp4xx/Kconfig index 5bf50a2..2a39f9e 100644 --- a/arch/arm/mach-ixp4xx/Kconfig +++ b/arch/arm/mach-ixp4xx/Kconfig @@ -11,6 +11,7 @@ comment "IXP4xx Platforms" config MACH_NSLU2 bool prompt "Linksys NSLU2" + select PCI help Say 'Y' here if you want your kernel to support Linksys's NSLU2 NAS device. For more information on this platform, @@ -18,6 +19,7 @@ config MACH_NSLU2 config ARCH_AVILA bool "Avila" + select PCI help Say 'Y' here if you want your kernel to support the Gateworks Avila Network Platform. For more information on this platform, @@ -25,6 +27,7 @@ config ARCH_AVILA config ARCH_ADI_COYOTE bool "Coyote" + select PCI help Say 'Y' here if you want your kernel to support the ADI Engineering Coyote Gateway Reference Platform. For more @@ -32,6 +35,7 @@ config ARCH_ADI_COYOTE config ARCH_IXDP425 bool "IXDP425" + select PCI help Say 'Y' here if you want your kernel to support Intel's IXDP425 Development Platform (Also known as Richfield). @@ -39,6 +43,7 @@ config ARCH_IXDP425 config MACH_IXDPG425 bool "IXDPG425" + select PCI help Say 'Y' here if you want your kernel to support Intel's IXDPG425 Development Platform (Also known as Montajade). @@ -46,6 +51,7 @@ config MACH_IXDPG425 config MACH_IXDP465 bool "IXDP465" + select PCI help Say 'Y' here if you want your kernel to support Intel's IXDP465 Development Platform (Also known as BMP). @@ -72,6 +78,7 @@ config ARCH_PRPMC1100 config MACH_NAS100D bool prompt "NAS100D" + select PCI help Say 'Y' here if you want your kernel to support Iomega's NAS 100d device. For more information on this platform, @@ -96,6 +103,7 @@ config CPU_IXP46X config MACH_GTWX5715 bool "Gemtek WX5715 (Linksys WRV54G)" depends on ARCH_IXP4XX + select PCI help This board is currently inside the Linksys WRV54G Gateways. @@ -110,11 +118,16 @@ config MACH_GTWX5715 "High Speed" UART is n/c (as far as I can tell) 20 Pin ARM/Xscale JTAG interface on J2 - comment "IXP4xx Options" +config DMABOUNCE + bool + default y + depends on PCI + config IXP4XX_INDIRECT_PCI bool "Use indirect PCI memory access" + depends on PCI help IXP4xx provides two methods of accessing PCI memory space: diff --git a/arch/arm/mach-ixp4xx/Makefile b/arch/arm/mach-ixp4xx/Makefile index 0471044..5a4aaa0 100644 --- a/arch/arm/mach-ixp4xx/Makefile +++ b/arch/arm/mach-ixp4xx/Makefile @@ -2,8 +2,9 @@ # Makefile for the linux kernel. # -obj-y += common.o common-pci.o +obj-y += common.o +obj-$(CONFIG_PCI) += common-pci.o obj-$(CONFIG_ARCH_IXDP4XX) += ixdp425-pci.o ixdp425-setup.o obj-$(CONFIG_MACH_IXDPG425) += ixdpg425-pci.o coyote-setup.o obj-$(CONFIG_ARCH_ADI_COYOTE) += coyote-pci.o coyote-setup.o diff --git a/include/asm-arm/arch-ixp4xx/io.h b/include/asm-arm/arch-ixp4xx/io.h index 942b622..b59520e 100644 --- a/include/asm-arm/arch-ixp4xx/io.h +++ b/include/asm-arm/arch-ixp4xx/io.h @@ -260,6 +260,12 @@ out: #endif +#ifndef CONFIG_PCI + +#define __io(v) v + +#else + /* * IXP4xx does not have a transparent cpu -> PCI I/O translation * window. Instead, it has a set of registers that must be tweaked @@ -578,6 +584,7 @@ __ixp4xx_iowrite32_rep(void __iomem *addr, const void *vaddr, u32 count) #define ioport_map(port, nr) ((void __iomem*)(port + PIO_OFFSET)) #define ioport_unmap(addr) +#endif // !CONFIG_PCI #endif // __ASM_ARM_ARCH_IO_H diff --git a/include/asm-arm/arch-ixp4xx/memory.h b/include/asm-arm/arch-ixp4xx/memory.h index ee211d2..af9667b 100644 --- a/include/asm-arm/arch-ixp4xx/memory.h +++ b/include/asm-arm/arch-ixp4xx/memory.h @@ -14,7 +14,7 @@ */ #define PHYS_OFFSET UL(0x00000000) -#ifndef __ASSEMBLY__ +#if !defined(__ASSEMBLY__) && defined(CONFIG_PCI) void ixp4xx_adjust_zones(int node, unsigned long *size, unsigned long *holes); -- cgit v0.10.2 From 81d38428df26377c91e7e193aa4d2fdfdcda300a Mon Sep 17 00:00:00 2001 From: Pavel Pisa Date: Sun, 30 Apr 2006 15:35:54 +0100 Subject: [ARM] 3485/1: i.MX: MX1 SD/MMC fix of unintentional double start possibility Patch from Pavel Pisa The clock starting imxmci_start_clock() function contains hardware issue workaround, which repeats start attempt, if SDHC does not react on the first trial. But the second start attempt can be taken even, if the first succeed and test code misses time limited clock running phase due to delay caused by schedule to other task or some another device interrupt. This change enables to detect such situation. The performance is not issue, because usually at full clock rate only about six loops in delay cycle are needed. Signed-off-by: Pavel Pisa Acked-by: Sascha Hauer Signed-off-by: Russell King diff --git a/drivers/mmc/imxmmc.c b/drivers/mmc/imxmmc.c index ffb7f55..07f36c4 100644 --- a/drivers/mmc/imxmmc.c +++ b/drivers/mmc/imxmmc.c @@ -102,6 +102,7 @@ struct imxmci_host { #define IMXMCI_PEND_CPU_DATA_b 5 #define IMXMCI_PEND_CARD_XCHG_b 6 #define IMXMCI_PEND_SET_INIT_b 7 +#define IMXMCI_PEND_STARTED_b 8 #define IMXMCI_PEND_IRQ_m (1 << IMXMCI_PEND_IRQ_b) #define IMXMCI_PEND_DMA_END_m (1 << IMXMCI_PEND_DMA_END_b) @@ -111,6 +112,7 @@ struct imxmci_host { #define IMXMCI_PEND_CPU_DATA_m (1 << IMXMCI_PEND_CPU_DATA_b) #define IMXMCI_PEND_CARD_XCHG_m (1 << IMXMCI_PEND_CARD_XCHG_b) #define IMXMCI_PEND_SET_INIT_m (1 << IMXMCI_PEND_SET_INIT_b) +#define IMXMCI_PEND_STARTED_m (1 << IMXMCI_PEND_STARTED_b) static void imxmci_stop_clock(struct imxmci_host *host) { @@ -131,23 +133,52 @@ static void imxmci_stop_clock(struct imxmci_host *host) dev_dbg(mmc_dev(host->mmc), "imxmci_stop_clock blocked, no luck\n"); } -static void imxmci_start_clock(struct imxmci_host *host) +static int imxmci_start_clock(struct imxmci_host *host) { - int i = 0; + unsigned int trials = 0; + unsigned int delay_limit = 128; + unsigned long flags; + MMC_STR_STP_CLK &= ~STR_STP_CLK_STOP_CLK; - while(i < 0x1000) { - if(!(i & 0x7f)) - MMC_STR_STP_CLK |= STR_STP_CLK_START_CLK; - if(MMC_STATUS & STATUS_CARD_BUS_CLK_RUN) { - /* Check twice before cut */ + clear_bit(IMXMCI_PEND_STARTED_b, &host->pending_events); + + /* + * Command start of the clock, this usually succeeds in less + * then 6 delay loops, but during card detection (low clockrate) + * it takes up to 5000 delay loops and sometimes fails for the first time + */ + MMC_STR_STP_CLK |= STR_STP_CLK_START_CLK; + + do { + unsigned int delay = delay_limit; + + while(delay--){ if(MMC_STATUS & STATUS_CARD_BUS_CLK_RUN) - return; + /* Check twice before cut */ + if(MMC_STATUS & STATUS_CARD_BUS_CLK_RUN) + return 0; + + if(test_bit(IMXMCI_PEND_STARTED_b, &host->pending_events)) + return 0; } - i++; - } - dev_dbg(mmc_dev(host->mmc), "imxmci_start_clock blocked, no luck\n"); + local_irq_save(flags); + /* + * Ensure, that request is not doubled under all possible circumstances. + * It is possible, that cock running state is missed, because some other + * IRQ or schedule delays this function execution and the clocks has + * been already stopped by other means (response processing, SDHC HW) + */ + if(!test_bit(IMXMCI_PEND_STARTED_b, &host->pending_events)) + MMC_STR_STP_CLK |= STR_STP_CLK_START_CLK; + local_irq_restore(flags); + + } while(++trials<256); + + dev_err(mmc_dev(host->mmc), "imxmci_start_clock blocked, no luck\n"); + + return -1; } static void imxmci_softreset(void) @@ -622,6 +653,7 @@ static irqreturn_t imxmci_irq(int irq, void *devid, struct pt_regs *regs) atomic_set(&host->stuck_timeout, 0); host->status_reg = stat; set_bit(IMXMCI_PEND_IRQ_b, &host->pending_events); + set_bit(IMXMCI_PEND_STARTED_b, &host->pending_events); tasklet_schedule(&host->tasklet); return IRQ_RETVAL(handled);; -- cgit v0.10.2 From 46e678c96bbd775abd05d3ddbe2fd334794f9157 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 30 Apr 2006 16:36:32 +0200 Subject: [PATCH] splice: fix bugs with stealing regular pipe pages - Check that page has suitable count for stealing in the regular pipes. - pipe_to_file() assumes that the page is locked on succesful steal, so do that in the pipe steal hook - Missing unlock_page() in add_to_page_cache() failure. Signed-off-by: Jens Axboe diff --git a/fs/pipe.c b/fs/pipe.c index 7fefb10..5a36927 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -127,8 +127,15 @@ static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - buf->flags |= PIPE_BUF_FLAG_STOLEN; - return 0; + struct page *page = buf->page; + + if (page_count(page) == 1) { + buf->flags |= PIPE_BUF_FLAG_STOLEN; + lock_page(page); + return 0; + } + + return 1; } static void anon_pipe_buf_get(struct pipe_inode_info *info, diff --git a/fs/splice.c b/fs/splice.c index a46ddd2..9df28d3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -599,8 +599,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, goto find_page; page = buf->page; - if (add_to_page_cache(page, mapping, index, gfp_mask)) + if (add_to_page_cache(page, mapping, index, gfp_mask)) { + unlock_page(page); goto find_page; + } if (!(buf->flags & PIPE_BUF_FLAG_LRU)) lru_cache_add(page); -- cgit v0.10.2 From 1241140f5183db38393556832198a3b109bf9085 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 30 Apr 2006 21:40:13 -0700 Subject: [SPARC64]: Kill __flush_tlb_page() prototype. This function no longer exists. Signed-off-by: David S. Miller diff --git a/include/asm-sparc64/tlbflush.h b/include/asm-sparc64/tlbflush.h index 9ad5d9c..e3a7c45 100644 --- a/include/asm-sparc64/tlbflush.h +++ b/include/asm-sparc64/tlbflush.h @@ -22,8 +22,6 @@ extern void flush_tlb_pending(void); /* Local cpu only. */ extern void __flush_tlb_all(void); -extern void __flush_tlb_page(unsigned long context, unsigned long page, unsigned long r); - extern void __flush_tlb_kernel_range(unsigned long start, unsigned long end); #ifndef CONFIG_SMP -- cgit v0.10.2 From c9f2946fbec88d4baa3a6d47eb3a8e6b08b05cd9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 30 Apr 2006 22:54:27 -0700 Subject: [SPARC64]: Disable preemption during flush_tlb_pending(). A context switch will force a call to flush_tlb_pending() (via switch_to()), so if we test tlb_nr to be non-zero, then sleep, it would become zero and later back at the original context we'll pass zero down into the TLB flushing code which should never see a nr argument of zero. Signed-off-by: David S. Miller diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c index a079cf4..3f10fc9 100644 --- a/arch/sparc64/mm/tlb.c +++ b/arch/sparc64/mm/tlb.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -24,6 +25,8 @@ void flush_tlb_pending(void) { struct mmu_gather *mp = &__get_cpu_var(mmu_gathers); + preempt_disable(); + if (mp->tlb_nr) { flush_tsb_user(mp); @@ -38,6 +41,8 @@ void flush_tlb_pending(void) } mp->tlb_nr = 0; } + + preempt_enable(); } void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t orig) -- cgit v0.10.2 From 45d9bb0e37668b7c64d1e49e98fbc4733c23b334 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 29 Mar 2006 20:02:55 -0500 Subject: [PATCH] deal with deadlocks in audit_free() Don't assume that audit_log_exit() et.al. are called for the context of current; pass task explictly. Signed-off-by: Al Viro diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7f160df..4052f0a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -536,13 +536,13 @@ error_path: return; } -static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask) +static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk, gfp_t gfp_mask) { - char name[sizeof(current->comm)]; - struct mm_struct *mm = current->mm; + char name[sizeof(tsk->comm)]; + struct mm_struct *mm = tsk->mm; struct vm_area_struct *vma; - get_task_comm(name, current); + get_task_comm(name, tsk); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, name); @@ -551,7 +551,7 @@ static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask) /* * this is brittle; all callers that pass GFP_ATOMIC will have - * NULL current->mm and we won't get here. + * NULL tsk->mm and we won't get here. */ down_read(&mm->mmap_sem); vma = mm->mmap; @@ -569,7 +569,7 @@ static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask) audit_log_task_context(ab, gfp_mask); } -static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) +static void audit_log_exit(struct audit_context *context, struct task_struct *tsk, gfp_t gfp_mask) { int i; struct audit_buffer *ab; @@ -587,8 +587,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); - if (current->signal->tty && current->signal->tty->name) - tty = current->signal->tty->name; + if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) + tty = tsk->signal->tty->name; else tty = "(none)"; audit_log_format(ab, @@ -720,7 +720,7 @@ void audit_free(struct task_struct *tsk) * We use GFP_ATOMIC here because we might be doing this * in the context of the idle thread */ if (context->in_syscall && context->auditable) - audit_log_exit(context, GFP_ATOMIC); + audit_log_exit(context, tsk, GFP_ATOMIC); audit_free_context(context); } @@ -839,7 +839,7 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) goto out; if (context->in_syscall && context->auditable) - audit_log_exit(context, GFP_KERNEL); + audit_log_exit(context, tsk, GFP_KERNEL); context->in_syscall = 0; context->auditable = 0; -- cgit v0.10.2 From d6fe3945b42d09a1eca7ad180a1646e585b8594f Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Thu, 30 Mar 2006 12:20:22 -0500 Subject: [PATCH] sockaddr patch On Thursday 23 March 2006 09:08, John D. Ramsdell wrote: > I noticed that a socketcall(bind) and socketcall(connect) event contain a > record of type=SOCKADDR, but I cannot see one for a system call event > associated with socketcall(accept). Recording the sockaddr of an accepted > socket is important for cross platform information flow analys Thanks for pointing this out. The following patch should address this. Signed-off-by: Steve Grubb Signed-off-by: Al Viro diff --git a/net/socket.c b/net/socket.c index 0ce12df..02948b6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -267,6 +267,8 @@ int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ule return -EINVAL; if(len) { + if (audit_sockaddr(klen, kaddr)) + return -ENOMEM; if(copy_to_user(uaddr,kaddr,len)) return -EFAULT; } -- cgit v0.10.2 From fa84cb935d4ec601528f5e2f0d5d31e7876a5044 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 29 Mar 2006 20:30:19 -0500 Subject: [PATCH] move call of audit_free() into do_exit() Signed-off-by: Al Viro diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4052f0a..8ec52ff 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -698,19 +698,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts * audit_free - free a per-task audit context * @tsk: task whose audit context block to free * - * Called from copy_process and __put_task_struct. + * Called from copy_process and do_exit */ void audit_free(struct task_struct *tsk) { struct audit_context *context; - /* - * No need to lock the task - when we execute audit_free() - * then the task has no external references anymore, and - * we are tearing it down. (The locking also confuses - * DEBUG_LOCKDEP - this freeing may occur in softirq - * contexts as well, via RCU.) - */ context = audit_get_context(tsk, 0, 0); if (likely(!context)) return; diff --git a/kernel/exit.c b/kernel/exit.c index f86434d..e95b932 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -35,6 +35,7 @@ #include #include #include +#include /* for audit_free() */ #include #include @@ -910,6 +911,8 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->compat_robust_list)) compat_exit_robust_list(tsk); #endif + if (unlikely(tsk->audit_context)) + audit_free(tsk); exit_mm(tsk); exit_sem(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index d2fa57d..ac8100e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -114,8 +114,6 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); - if (unlikely(tsk->audit_context)) - audit_free(tsk); security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); -- cgit v0.10.2 From e495149b173d8e133e1f6f2eb86fd97be7e92010 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 29 Mar 2006 20:17:10 -0500 Subject: [PATCH] drop gfp_mask in audit_log_exit() now we can do that - all callers are process-synchronous and do not hold any locks. Signed-off-by: Al Viro diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8ec52ff..ba0ec1b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -506,7 +506,7 @@ static inline void audit_free_context(struct audit_context *context) printk(KERN_ERR "audit: freed %d contexts\n", count); } -static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask) +static void audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; ssize_t len = 0; @@ -518,7 +518,7 @@ static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask) return; } - ctx = kmalloc(len, gfp_mask); + ctx = kmalloc(len, GFP_KERNEL); if (!ctx) goto error_path; @@ -536,47 +536,46 @@ error_path: return; } -static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk, gfp_t gfp_mask) +static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { char name[sizeof(tsk->comm)]; struct mm_struct *mm = tsk->mm; struct vm_area_struct *vma; + /* tsk == current */ + get_task_comm(name, tsk); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, name); - if (!mm) - return; - - /* - * this is brittle; all callers that pass GFP_ATOMIC will have - * NULL tsk->mm and we won't get here. - */ - down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && - vma->vm_file) { - audit_log_d_path(ab, "exe=", - vma->vm_file->f_dentry, - vma->vm_file->f_vfsmnt); - break; + if (mm) { + down_read(&mm->mmap_sem); + vma = mm->mmap; + while (vma) { + if ((vma->vm_flags & VM_EXECUTABLE) && + vma->vm_file) { + audit_log_d_path(ab, "exe=", + vma->vm_file->f_dentry, + vma->vm_file->f_vfsmnt); + break; + } + vma = vma->vm_next; } - vma = vma->vm_next; + up_read(&mm->mmap_sem); } - up_read(&mm->mmap_sem); - audit_log_task_context(ab, gfp_mask); + audit_log_task_context(ab); } -static void audit_log_exit(struct audit_context *context, struct task_struct *tsk, gfp_t gfp_mask) +static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i; struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; - ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); + /* tsk == current */ + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); if (!ab) return; /* audit_panic has been called */ audit_log_format(ab, "arch=%x syscall=%d", @@ -607,12 +606,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->gid, context->euid, context->suid, context->fsuid, context->egid, context->sgid, context->fsgid, tty); - audit_log_task_info(ab, gfp_mask); + audit_log_task_info(ab, tsk); audit_log_end(ab); for (aux = context->aux; aux; aux = aux->next) { - ab = audit_log_start(context, gfp_mask, aux->type); + ab = audit_log_start(context, GFP_KERNEL, aux->type); if (!ab) continue; /* audit_panic has been called */ @@ -649,7 +648,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } if (context->pwd && context->pwdmnt) { - ab = audit_log_start(context, gfp_mask, AUDIT_CWD); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); audit_log_end(ab); @@ -659,7 +658,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts unsigned long ino = context->names[i].ino; unsigned long pino = context->names[i].pino; - ab = audit_log_start(context, gfp_mask, AUDIT_PATH); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ @@ -712,8 +711,9 @@ void audit_free(struct task_struct *tsk) * function (e.g., exit_group), then free context block. * We use GFP_ATOMIC here because we might be doing this * in the context of the idle thread */ + /* that can happen only if we are called from do_exit() */ if (context->in_syscall && context->auditable) - audit_log_exit(context, tsk, GFP_ATOMIC); + audit_log_exit(context, tsk); audit_free_context(context); } @@ -821,6 +821,8 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) { struct audit_context *context; + /* tsk == current */ + get_task_struct(tsk); task_lock(tsk); context = audit_get_context(tsk, valid, return_code); @@ -832,7 +834,7 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) goto out; if (context->in_syscall && context->auditable) - audit_log_exit(context, tsk, GFP_KERNEL); + audit_log_exit(context, tsk); context->in_syscall = 0; context->auditable = 0; -- cgit v0.10.2 From 5411be59db80333039386f3b1ccfe5eb9023a916 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 29 Mar 2006 20:23:36 -0500 Subject: [PATCH] drop task argument of audit_syscall_{entry,exit} ... it's always current, and that's a good thing - allows simpler locking. Signed-off-by: Al Viro diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 506462e..fd7eaf7 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -671,7 +671,7 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit) if (unlikely(current->audit_context)) { if (entryexit) - audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), + audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax); /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is @@ -720,14 +720,13 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit) ret = is_sysemu; out: if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(current, AUDIT_ARCH_I386, regs->orig_eax, + audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax, regs->ebx, regs->ecx, regs->edx, regs->esi); if (ret == 0) return 0; regs->orig_eax = -1; /* force skip of syscall restarting */ if (unlikely(current->audit_context)) - audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), - regs->eax); + audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax); return 1; } diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index aee14fa..00e0118 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -312,7 +312,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk /*call audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - audit_syscall_exit(current, AUDITSC_RESULT(eax), eax); + audit_syscall_exit(AUDITSC_RESULT(eax), eax); __asm__ __volatile__( "movl %0,%%esp\n\t" diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 9887c87..e61e15e 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1644,7 +1644,7 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, arch = AUDIT_ARCH_IA64; } - audit_syscall_entry(current, arch, syscall, arg0, arg1, arg2, arg3); + audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3); } } @@ -1662,7 +1662,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, if (success != AUDITSC_SUCCESS) result = -result; - audit_syscall_exit(current, success, result); + audit_syscall_exit(success, result); } if (test_thread_flag(TIF_SYSCALL_TRACE) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index f3106d0..9b4733c 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -483,7 +483,7 @@ static inline int audit_arch(void) asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit) { if (unlikely(current->audit_context) && entryexit) - audit_syscall_exit(current, AUDITSC_RESULT(regs->regs[2]), + audit_syscall_exit(AUDITSC_RESULT(regs->regs[2]), regs->regs[2]); if (!(current->ptrace & PT_PTRACED)) @@ -507,7 +507,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit) } out: if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(current, audit_arch(), regs->regs[2], + audit_syscall_entry(audit_arch(), regs->regs[2], regs->regs[4], regs->regs[5], regs->regs[6], regs->regs[7]); } diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index bcb8357..4a677d1 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -538,7 +538,7 @@ void do_syscall_trace_enter(struct pt_regs *regs) do_syscall_trace(); if (unlikely(current->audit_context)) - audit_syscall_entry(current, + audit_syscall_entry( #ifdef CONFIG_PPC32 AUDIT_ARCH_PPC, #else @@ -556,8 +556,7 @@ void do_syscall_trace_leave(struct pt_regs *regs) #endif if (unlikely(current->audit_context)) - audit_syscall_exit(current, - (regs->ccr&0x1000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, + audit_syscall_exit((regs->ccr&0x1000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, regs->result); if ((test_thread_flag(TIF_SYSCALL_TRACE) diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 37dfe33..8f36504 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -734,7 +734,7 @@ asmlinkage void syscall_trace(struct pt_regs *regs, int entryexit) { if (unlikely(current->audit_context) && entryexit) - audit_syscall_exit(current, AUDITSC_RESULT(regs->gprs[2]), regs->gprs[2]); + audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), regs->gprs[2]); if (!test_thread_flag(TIF_SYSCALL_TRACE)) goto out; @@ -761,8 +761,7 @@ syscall_trace(struct pt_regs *regs, int entryexit) } out: if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(current, - test_thread_flag(TIF_31BIT)?AUDIT_ARCH_S390:AUDIT_ARCH_S390X, + audit_syscall_entry(test_thread_flag(TIF_31BIT)?AUDIT_ARCH_S390:AUDIT_ARCH_S390X, regs->gprs[2], regs->orig_gpr2, regs->gprs[3], regs->gprs[4], regs->gprs[5]); } diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c index 49e6ded..d31975e 100644 --- a/arch/sparc64/kernel/ptrace.c +++ b/arch/sparc64/kernel/ptrace.c @@ -653,7 +653,7 @@ asmlinkage void syscall_trace(struct pt_regs *regs, int syscall_exit_p) if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY))) result = AUDITSC_FAILURE; - audit_syscall_exit(current, result, regs->u_regs[UREG_I0]); + audit_syscall_exit(result, regs->u_regs[UREG_I0]); } if (!(current->ptrace & PT_PTRACED)) @@ -677,8 +677,7 @@ asmlinkage void syscall_trace(struct pt_regs *regs, int syscall_exit_p) out: if (unlikely(current->audit_context) && !syscall_exit_p) - audit_syscall_entry(current, - (test_thread_flag(TIF_32BIT) ? + audit_syscall_entry((test_thread_flag(TIF_32BIT) ? AUDIT_ARCH_SPARC : AUDIT_ARCH_SPARC64), regs->u_regs[UREG_G1], diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 60d2eda..9a77fb3 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -275,15 +275,13 @@ void syscall_trace(union uml_pt_regs *regs, int entryexit) if (unlikely(current->audit_context)) { if (!entryexit) - audit_syscall_entry(current, - HOST_AUDIT_ARCH, + audit_syscall_entry(HOST_AUDIT_ARCH, UPT_SYSCALL_NR(regs), UPT_SYSCALL_ARG1(regs), UPT_SYSCALL_ARG2(regs), UPT_SYSCALL_ARG3(regs), UPT_SYSCALL_ARG4(regs)); - else audit_syscall_exit(current, - AUDITSC_RESULT(UPT_SYSCALL_RET(regs)), + else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)), UPT_SYSCALL_RET(regs)); } diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index da8e790..2d50024 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -600,12 +600,12 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) if (unlikely(current->audit_context)) { if (test_thread_flag(TIF_IA32)) { - audit_syscall_entry(current, AUDIT_ARCH_I386, + audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_rax, regs->rbx, regs->rcx, regs->rdx, regs->rsi); } else { - audit_syscall_entry(current, AUDIT_ARCH_X86_64, + audit_syscall_entry(AUDIT_ARCH_X86_64, regs->orig_rax, regs->rdi, regs->rsi, regs->rdx, regs->r10); @@ -616,7 +616,7 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) asmlinkage void syscall_trace_leave(struct pt_regs *regs) { if (unlikely(current->audit_context)) - audit_syscall_exit(current, AUDITSC_RESULT(regs->rax), regs->rax); + audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); if ((test_thread_flag(TIF_SYSCALL_TRACE) || test_thread_flag(TIF_SINGLESTEP)) diff --git a/include/linux/audit.h b/include/linux/audit.h index 1c47c59..39fef6e 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -287,10 +287,10 @@ struct netlink_skb_parms; /* Public API */ extern int audit_alloc(struct task_struct *task); extern void audit_free(struct task_struct *task); -extern void audit_syscall_entry(struct task_struct *task, int arch, +extern void audit_syscall_entry(int arch, int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); -extern void audit_syscall_exit(struct task_struct *task, int failed, long return_code); +extern void audit_syscall_exit(int failed, long return_code); extern void audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct inode *inode, unsigned flags); @@ -323,8 +323,8 @@ extern int audit_set_macxattr(const char *name); #else #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) -#define audit_syscall_entry(t,ta,a,b,c,d,e) do { ; } while (0) -#define audit_syscall_exit(t,f,r) do { ; } while (0) +#define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) +#define audit_syscall_exit(f,r) do { ; } while (0) #define audit_getname(n) do { ; } while (0) #define audit_putname(n) do { ; } while (0) #define __audit_inode(n,i,f) do { ; } while (0) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ba0ec1b..7ed82b0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -736,10 +736,11 @@ void audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). */ -void audit_syscall_entry(struct task_struct *tsk, int arch, int major, +void audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { + struct task_struct *tsk = current; struct audit_context *context = tsk->audit_context; enum audit_state state; @@ -817,12 +818,11 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ -void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) +void audit_syscall_exit(int valid, long return_code) { + struct task_struct *tsk = current; struct audit_context *context; - /* tsk == current */ - get_task_struct(tsk); task_lock(tsk); context = audit_get_context(tsk, valid, return_code); -- cgit v0.10.2 From 97e94c453073a2aba4bb5e0825ddc5e923debf11 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 29 Mar 2006 20:26:24 -0500 Subject: [PATCH] no need to wank with task_lock() and pinning task down in audit_syscall_exit() Signed-off-by: Al Viro diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7ed82b0..8aca4ab 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -329,7 +329,6 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -/* This should be called with task_lock() held. */ static inline struct audit_context *audit_get_context(struct task_struct *tsk, int return_valid, int return_code) @@ -823,15 +822,10 @@ void audit_syscall_exit(int valid, long return_code) struct task_struct *tsk = current; struct audit_context *context; - get_task_struct(tsk); - task_lock(tsk); context = audit_get_context(tsk, valid, return_code); - task_unlock(tsk); - /* Not having a context here is ok, since the parent may have - * called __put_task_struct. */ if (likely(!context)) - goto out; + return; if (context->in_syscall && context->auditable) audit_log_exit(context, tsk); @@ -849,8 +843,6 @@ void audit_syscall_exit(int valid, long return_code) audit_free_aux(context); tsk->audit_context = context; } - out: - put_task_struct(tsk); } /** -- cgit v0.10.2 From 376bd9cb357ec945ac893feaeb63af7370a6e70b Mon Sep 17 00:00:00 2001 From: Darrel Goeddel Date: Fri, 24 Feb 2006 15:44:05 -0600 Subject: [PATCH] support for context based audit filtering The following patch provides selinux interfaces that will allow the audit system to perform filtering based on the process context (user, role, type, sensitivity, and clearance). These interfaces will allow the selinux module to perform efficient matches based on lower level selinux constructs, rather than relying on context retrievals and string comparisons within the audit module. It also allows for dominance checks on the mls portion of the contexts that are impossible with only string comparisons. Signed-off-by: Darrel Goeddel Signed-off-by: Al Viro diff --git a/include/linux/audit.h b/include/linux/audit.h index 39fef6e..740f950 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -145,6 +145,11 @@ #define AUDIT_PERS 10 #define AUDIT_ARCH 11 #define AUDIT_MSGTYPE 12 +#define AUDIT_SE_USER 13 /* security label user */ +#define AUDIT_SE_ROLE 14 /* security label role */ +#define AUDIT_SE_TYPE 15 /* security label type */ +#define AUDIT_SE_SEN 16 /* security label sensitivity label */ +#define AUDIT_SE_CLR 17 /* security label clearance label */ /* These are ONLY useful when checking * at syscall exit time (AUDIT_AT_EXIT). */ diff --git a/include/linux/selinux.h b/include/linux/selinux.h new file mode 100644 index 0000000..9d684b1 --- /dev/null +++ b/include/linux/selinux.h @@ -0,0 +1,112 @@ +/* + * SELinux services exported to the rest of the kernel. + * + * Author: James Morris + * + * Copyright (C) 2005 Red Hat, Inc., James Morris + * Copyright (C) 2006 Trusted Computer Solutions, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + */ +#ifndef _LINUX_SELINUX_H +#define _LINUX_SELINUX_H + +struct selinux_audit_rule; +struct audit_context; + +#ifdef CONFIG_SECURITY_SELINUX + +/** + * selinux_audit_rule_init - alloc/init an selinux audit rule structure. + * @field: the field this rule refers to + * @op: the operater the rule uses + * @rulestr: the text "target" of the rule + * @rule: pointer to the new rule structure returned via this + * + * Returns 0 if successful, -errno if not. On success, the rule structure + * will be allocated internally. The caller must free this structure with + * selinux_audit_rule_free() after use. + */ +int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, + struct selinux_audit_rule **rule); + +/** + * selinux_audit_rule_free - free an selinux audit rule structure. + * @rule: pointer to the audit rule to be freed + * + * This will free all memory associated with the given rule. + * If @rule is NULL, no operation is performed. + */ +void selinux_audit_rule_free(struct selinux_audit_rule *rule); + +/** + * selinux_audit_rule_match - determine if a context ID matches a rule. + * @ctxid: the context ID to check + * @field: the field this rule refers to + * @op: the operater the rule uses + * @rule: pointer to the audit rule to check against + * @actx: the audit context (can be NULL) associated with the check + * + * Returns 1 if the context id matches the rule, 0 if it does not, and + * -errno on failure. + */ +int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op, + struct selinux_audit_rule *rule, + struct audit_context *actx); + +/** + * selinux_audit_set_callback - set the callback for policy reloads. + * @callback: the function to call when the policy is reloaded + * + * This sets the function callback function that will update the rules + * upon policy reloads. This callback should rebuild all existing rules + * using selinux_audit_rule_init(). + */ +void selinux_audit_set_callback(int (*callback)(void)); + +/** + * selinux_task_ctxid - determine a context ID for a process. + * @tsk: the task object + * @ctxid: ID value returned via this + * + * On return, ctxid will contain an ID for the context. This value + * should only be used opaquely. + */ +void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid); + +#else + +static inline int selinux_audit_rule_init(u32 field, u32 op, + char *rulestr, + struct selinux_audit_rule **rule) +{ + return -ENOTSUPP; +} + +static inline void selinux_audit_rule_free(struct selinux_audit_rule *rule) +{ + return; +} + +static inline int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op, + struct selinux_audit_rule *rule, + struct audit_context *actx) +{ + return 0; +} + +static inline void selinux_audit_set_callback(int (*callback)(void)) +{ + return; +} + +static inline void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid) +{ + *ctxid = 0; +} + +#endif /* CONFIG_SECURITY_SELINUX */ + +#endif /* _LINUX_SELINUX_H */ diff --git a/security/selinux/Makefile b/security/selinux/Makefile index 688c0a2..faf2e02 100644 --- a/security/selinux/Makefile +++ b/security/selinux/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ -selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o +selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o diff --git a/security/selinux/avc.c b/security/selinux/avc.c index ac5d69b..a300702 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -800,7 +800,7 @@ out: int avc_ss_reset(u32 seqno) { struct avc_callback_node *c; - int i, rc = 0; + int i, rc = 0, tmprc; unsigned long flag; struct avc_node *node; @@ -813,15 +813,16 @@ int avc_ss_reset(u32 seqno) for (c = avc_callbacks; c; c = c->next) { if (c->events & AVC_CALLBACK_RESET) { - rc = c->callback(AVC_CALLBACK_RESET, - 0, 0, 0, 0, NULL); - if (rc) - goto out; + tmprc = c->callback(AVC_CALLBACK_RESET, + 0, 0, 0, 0, NULL); + /* save the first error encountered for the return + value and continue processing the callbacks */ + if (!rc) + rc = tmprc; } } avc_latest_notif_update(seqno, 0); -out: return rc; } diff --git a/security/selinux/exports.c b/security/selinux/exports.c new file mode 100644 index 0000000..333c4c7 --- /dev/null +++ b/security/selinux/exports.c @@ -0,0 +1,28 @@ +/* + * SELinux services exported to the rest of the kernel. + * + * Author: James Morris + * + * Copyright (C) 2005 Red Hat, Inc., James Morris + * Copyright (C) 2006 Trusted Computer Solutions, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + */ +#include +#include +#include +#include + +#include "security.h" +#include "objsec.h" + +void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid) +{ + struct task_security_struct *tsec = tsk->security; + if (selinux_enabled) + *ctxid = tsec->sid; + else + *ctxid = 0; +} diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c index 84047f6..7bc5b64 100644 --- a/security/selinux/ss/mls.c +++ b/security/selinux/ss/mls.c @@ -8,7 +8,7 @@ * * Support for enhanced MLS infrastructure. * - * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. + * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. */ #include @@ -385,6 +385,34 @@ out: } /* + * Set the MLS fields in the security context structure + * `context' based on the string representation in + * the string `str'. This function will allocate temporary memory with the + * given constraints of gfp_mask. + */ +int mls_from_string(char *str, struct context *context, gfp_t gfp_mask) +{ + char *tmpstr, *freestr; + int rc; + + if (!selinux_mls_enabled) + return -EINVAL; + + /* we need freestr because mls_context_to_sid will change + the value of tmpstr */ + tmpstr = freestr = kstrdup(str, gfp_mask); + if (!tmpstr) { + rc = -ENOMEM; + } else { + rc = mls_context_to_sid(':', &tmpstr, context, + NULL, SECSID_NULL); + kfree(freestr); + } + + return rc; +} + +/* * Copies the effective MLS range from `src' into `dst'. */ static inline int mls_scopy_context(struct context *dst, diff --git a/security/selinux/ss/mls.h b/security/selinux/ss/mls.h index 03de697..fbb42f0 100644 --- a/security/selinux/ss/mls.h +++ b/security/selinux/ss/mls.h @@ -8,7 +8,7 @@ * * Support for enhanced MLS infrastructure. * - * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. + * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. */ #ifndef _SS_MLS_H_ @@ -27,6 +27,8 @@ int mls_context_to_sid(char oldc, struct sidtab *s, u32 def_sid); +int mls_from_string(char *str, struct context *context, gfp_t gfp_mask); + int mls_convert_context(struct policydb *oldp, struct policydb *newp, struct context *context); diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 6149248..7177e98 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -7,12 +7,13 @@ * Updated: Trusted Computer Solutions, Inc. * * Support for enhanced MLS infrastructure. + * Support for context based audit filters. * * Updated: Frank Mayer and Karl MacMillan * * Added conditional policy language extensions * - * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. + * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004 Tresys Technology, LLC * Copyright (C) 2003 Red Hat, Inc., James Morris * This program is free software; you can redistribute it and/or modify @@ -1811,3 +1812,235 @@ out: POLICY_RDUNLOCK; return rc; } + +struct selinux_audit_rule { + u32 au_seqno; + struct context au_ctxt; +}; + +void selinux_audit_rule_free(struct selinux_audit_rule *rule) +{ + if (rule) { + context_destroy(&rule->au_ctxt); + kfree(rule); + } +} + +int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, + struct selinux_audit_rule **rule) +{ + struct selinux_audit_rule *tmprule; + struct role_datum *roledatum; + struct type_datum *typedatum; + struct user_datum *userdatum; + int rc = 0; + + *rule = NULL; + + if (!ss_initialized) + return -ENOTSUPP; + + switch (field) { + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + /* only 'equals' and 'not equals' fit user, role, and type */ + if (op != AUDIT_EQUAL && op != AUDIT_NOT_EQUAL) + return -EINVAL; + break; + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + /* we do not allow a range, indicated by the presense of '-' */ + if (strchr(rulestr, '-')) + return -EINVAL; + break; + default: + /* only the above fields are valid */ + return -EINVAL; + } + + tmprule = kzalloc(sizeof(struct selinux_audit_rule), GFP_KERNEL); + if (!tmprule) + return -ENOMEM; + + context_init(&tmprule->au_ctxt); + + POLICY_RDLOCK; + + tmprule->au_seqno = latest_granting; + + switch (field) { + case AUDIT_SE_USER: + userdatum = hashtab_search(policydb.p_users.table, rulestr); + if (!userdatum) + rc = -EINVAL; + else + tmprule->au_ctxt.user = userdatum->value; + break; + case AUDIT_SE_ROLE: + roledatum = hashtab_search(policydb.p_roles.table, rulestr); + if (!roledatum) + rc = -EINVAL; + else + tmprule->au_ctxt.role = roledatum->value; + break; + case AUDIT_SE_TYPE: + typedatum = hashtab_search(policydb.p_types.table, rulestr); + if (!typedatum) + rc = -EINVAL; + else + tmprule->au_ctxt.type = typedatum->value; + break; + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + rc = mls_from_string(rulestr, &tmprule->au_ctxt, GFP_ATOMIC); + break; + } + + POLICY_RDUNLOCK; + + if (rc) { + selinux_audit_rule_free(tmprule); + tmprule = NULL; + } + + *rule = tmprule; + + return rc; +} + +int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op, + struct selinux_audit_rule *rule, + struct audit_context *actx) +{ + struct context *ctxt; + struct mls_level *level; + int match = 0; + + if (!rule) { + audit_log(actx, GFP_ATOMIC, AUDIT_SELINUX_ERR, + "selinux_audit_rule_match: missing rule\n"); + return -ENOENT; + } + + POLICY_RDLOCK; + + if (rule->au_seqno < latest_granting) { + audit_log(actx, GFP_ATOMIC, AUDIT_SELINUX_ERR, + "selinux_audit_rule_match: stale rule\n"); + match = -ESTALE; + goto out; + } + + ctxt = sidtab_search(&sidtab, ctxid); + if (!ctxt) { + audit_log(actx, GFP_ATOMIC, AUDIT_SELINUX_ERR, + "selinux_audit_rule_match: unrecognized SID %d\n", + ctxid); + match = -ENOENT; + goto out; + } + + /* a field/op pair that is not caught here will simply fall through + without a match */ + switch (field) { + case AUDIT_SE_USER: + switch (op) { + case AUDIT_EQUAL: + match = (ctxt->user == rule->au_ctxt.user); + break; + case AUDIT_NOT_EQUAL: + match = (ctxt->user != rule->au_ctxt.user); + break; + } + break; + case AUDIT_SE_ROLE: + switch (op) { + case AUDIT_EQUAL: + match = (ctxt->role == rule->au_ctxt.role); + break; + case AUDIT_NOT_EQUAL: + match = (ctxt->role != rule->au_ctxt.role); + break; + } + break; + case AUDIT_SE_TYPE: + switch (op) { + case AUDIT_EQUAL: + match = (ctxt->type == rule->au_ctxt.type); + break; + case AUDIT_NOT_EQUAL: + match = (ctxt->type != rule->au_ctxt.type); + break; + } + break; + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + level = (op == AUDIT_SE_SEN ? + &ctxt->range.level[0] : &ctxt->range.level[1]); + switch (op) { + case AUDIT_EQUAL: + match = mls_level_eq(&rule->au_ctxt.range.level[0], + level); + break; + case AUDIT_NOT_EQUAL: + match = !mls_level_eq(&rule->au_ctxt.range.level[0], + level); + break; + case AUDIT_LESS_THAN: + match = (mls_level_dom(&rule->au_ctxt.range.level[0], + level) && + !mls_level_eq(&rule->au_ctxt.range.level[0], + level)); + break; + case AUDIT_LESS_THAN_OR_EQUAL: + match = mls_level_dom(&rule->au_ctxt.range.level[0], + level); + break; + case AUDIT_GREATER_THAN: + match = (mls_level_dom(level, + &rule->au_ctxt.range.level[0]) && + !mls_level_eq(level, + &rule->au_ctxt.range.level[0])); + break; + case AUDIT_GREATER_THAN_OR_EQUAL: + match = mls_level_dom(level, + &rule->au_ctxt.range.level[0]); + break; + } + } + +out: + POLICY_RDUNLOCK; + return match; +} + +static int (*aurule_callback)(void) = NULL; + +static int aurule_avc_callback(u32 event, u32 ssid, u32 tsid, + u16 class, u32 perms, u32 *retained) +{ + int err = 0; + + if (event == AVC_CALLBACK_RESET && aurule_callback) + err = aurule_callback(); + return err; +} + +static int __init aurule_init(void) +{ + int err; + + err = avc_add_callback(aurule_avc_callback, AVC_CALLBACK_RESET, + SECSID_NULL, SECSID_NULL, SECCLASS_NULL, 0); + if (err) + panic("avc_add_callback() failed, error %d\n", err); + + return err; +} +__initcall(aurule_init); + +void selinux_audit_set_callback(int (*callback)(void)) +{ + aurule_callback = callback; +} -- cgit v0.10.2 From 3dc7e3153eddfcf7ba8b50628775ba516e5f759f Mon Sep 17 00:00:00 2001 From: Darrel Goeddel Date: Fri, 10 Mar 2006 18:14:06 -0600 Subject: [PATCH] support for context based audit filtering, part 2 This patch provides the ability to filter audit messages based on the elements of the process' SELinux context (user, role, type, mls sensitivity, and mls clearance). It uses the new interfaces from selinux to opaquely store information related to the selinux context and to filter based on that information. It also uses the callback mechanism provided by selinux to refresh the information when a new policy is loaded. Signed-off-by: Al Viro diff --git a/kernel/audit.c b/kernel/audit.c index c8ccbd0..9060be7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -55,6 +55,9 @@ #include #include #include +#include + +#include "audit.h" /* No auditing will take place until audit_initialized != 0. * (Initialization happens after skb_init is called.) */ @@ -564,6 +567,11 @@ static int __init audit_init(void) skb_queue_head_init(&audit_skb_queue); audit_initialized = 1; audit_enabled = audit_default; + + /* Register the callback with selinux. This callback will be invoked + * when a new policy is loaded. */ + selinux_audit_set_callback(&selinux_audit_rule_update); + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); return 0; } diff --git a/kernel/audit.h b/kernel/audit.h index bc53920..6f73392 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -54,9 +54,11 @@ enum audit_state { /* Rule lists */ struct audit_field { - u32 type; - u32 val; - u32 op; + u32 type; + u32 val; + u32 op; + char *se_str; + struct selinux_audit_rule *se_rule; }; struct audit_krule { @@ -86,3 +88,5 @@ extern void audit_send_reply(int pid, int seq, int type, extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); extern struct mutex audit_netlink_mutex; + +extern int selinux_audit_rule_update(void); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d3a8539..85a7862 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "audit.h" /* There are three lists of rules -- one to search at task creation @@ -42,6 +43,13 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { static inline void audit_free_rule(struct audit_entry *e) { + int i; + if (e->rule.fields) + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + kfree(f->se_str); + selinux_audit_rule_free(f->se_rule); + } kfree(e->rule.fields); kfree(e); } @@ -52,9 +60,29 @@ static inline void audit_free_rule_rcu(struct rcu_head *head) audit_free_rule(e); } +/* Initialize an audit filterlist entry. */ +static inline struct audit_entry *audit_init_entry(u32 field_count) +{ + struct audit_entry *entry; + struct audit_field *fields; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (unlikely(!entry)) + return NULL; + + fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + if (unlikely(!fields)) { + kfree(entry); + return NULL; + } + entry->rule.fields = fields; + + return entry; +} + /* Unpack a filter field's string representation from user-space * buffer. */ -static __attribute__((unused)) char *audit_unpack_string(void **bufp, size_t *remain, size_t len) +static char *audit_unpack_string(void **bufp, size_t *remain, size_t len) { char *str; @@ -84,7 +112,6 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) { unsigned listnr; struct audit_entry *entry; - struct audit_field *fields; int i, err; err = -EINVAL; @@ -108,23 +135,14 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) goto exit_err; err = -ENOMEM; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (unlikely(!entry)) - goto exit_err; - fields = kmalloc(sizeof(*fields) * rule->field_count, GFP_KERNEL); - if (unlikely(!fields)) { - kfree(entry); + entry = audit_init_entry(rule->field_count); + if (!entry) goto exit_err; - } - - memset(&entry->rule, 0, sizeof(struct audit_krule)); - memset(fields, 0, sizeof(struct audit_field)); entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; entry->rule.listnr = listnr; entry->rule.action = rule->action; entry->rule.field_count = rule->field_count; - entry->rule.fields = fields; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) entry->rule.mask[i] = rule->mask[i]; @@ -150,15 +168,20 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; - if (rule->fields[i] & AUDIT_UNUSED_BITS) { - err = -EINVAL; - goto exit_free; - } - f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; + if (f->type & AUDIT_UNUSED_BITS || + f->type == AUDIT_SE_USER || + f->type == AUDIT_SE_ROLE || + f->type == AUDIT_SE_TYPE || + f->type == AUDIT_SE_SEN || + f->type == AUDIT_SE_CLR) { + err = -EINVAL; + goto exit_free; + } + entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; /* Support for legacy operators where @@ -188,8 +211,9 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, int err = 0; struct audit_entry *entry; void *bufp; - /* size_t remain = datasz - sizeof(struct audit_rule_data); */ + size_t remain = datasz - sizeof(struct audit_rule_data); int i; + char *str; entry = audit_to_entry_common((struct audit_rule *)data); if (IS_ERR(entry)) @@ -207,10 +231,35 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->op = data->fieldflags[i] & AUDIT_OPERATORS; f->type = data->fields[i]; + f->val = data->values[i]; + f->se_str = NULL; + f->se_rule = NULL; switch(f->type) { - /* call type-specific conversion routines here */ - default: - f->val = data->values[i]; + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + + err = selinux_audit_rule_init(f->type, f->op, str, + &f->se_rule); + /* Keep currently invalid fields around in case they + * become valid after a policy reload. */ + if (err == -EINVAL) { + printk(KERN_WARNING "audit rule for selinux " + "\'%s\' is invalid\n", str); + err = 0; + } + if (err) { + kfree(str); + goto exit_free; + } else + f->se_str = str; + break; } } @@ -286,7 +335,14 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->fields[i] = f->type; data->fieldflags[i] = f->op; switch(f->type) { - /* call type-specific conversion routines here */ + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + data->buflen += data->values[i] = + audit_pack_string(&bufp, f->se_str); + break; default: data->values[i] = f->val; } @@ -314,7 +370,14 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 1; switch(a->fields[i].type) { - /* call type-specific comparison routines here */ + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) + return 1; + break; default: if (a->fields[i].val != b->fields[i].val) return 1; @@ -328,6 +391,81 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 0; } +/* Duplicate selinux field information. The se_rule is opaque, so must be + * re-initialized. */ +static inline int audit_dupe_selinux_field(struct audit_field *df, + struct audit_field *sf) +{ + int ret = 0; + char *se_str; + + /* our own copy of se_str */ + se_str = kstrdup(sf->se_str, GFP_KERNEL); + if (unlikely(IS_ERR(se_str))) + return -ENOMEM; + df->se_str = se_str; + + /* our own (refreshed) copy of se_rule */ + ret = selinux_audit_rule_init(df->type, df->op, df->se_str, + &df->se_rule); + /* Keep currently invalid fields around in case they + * become valid after a policy reload. */ + if (ret == -EINVAL) { + printk(KERN_WARNING "audit rule for selinux \'%s\' is " + "invalid\n", df->se_str); + ret = 0; + } + + return ret; +} + +/* Duplicate an audit rule. This will be a deep copy with the exception + * of the watch - that pointer is carried over. The selinux specific fields + * will be updated in the copy. The point is to be able to replace the old + * rule with the new rule in the filterlist, then free the old rule. */ +static struct audit_entry *audit_dupe_rule(struct audit_krule *old) +{ + u32 fcount = old->field_count; + struct audit_entry *entry; + struct audit_krule *new; + int i, err = 0; + + entry = audit_init_entry(fcount); + if (unlikely(!entry)) + return ERR_PTR(-ENOMEM); + + new = &entry->rule; + new->vers_ops = old->vers_ops; + new->flags = old->flags; + new->listnr = old->listnr; + new->action = old->action; + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + new->mask[i] = old->mask[i]; + new->buflen = old->buflen; + new->field_count = old->field_count; + memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); + + /* deep copy this information, updating the se_rule fields, because + * the originals will all be freed when the old rule is freed. */ + for (i = 0; i < fcount; i++) { + switch (new->fields[i].type) { + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + err = audit_dupe_selinux_field(&new->fields[i], + &old->fields[i]); + } + if (err) { + audit_free_rule(entry); + return ERR_PTR(err); + } + } + + return entry; +} + /* Add rule to given filterlist if not a duplicate. Protected by * audit_netlink_mutex. */ static inline int audit_add_rule(struct audit_entry *entry, @@ -628,3 +766,62 @@ unlock_and_return: rcu_read_unlock(); return result; } + +/* Check to see if the rule contains any selinux fields. Returns 1 if there + are selinux fields specified in the rule, 0 otherwise. */ +static inline int audit_rule_has_selinux(struct audit_krule *rule) +{ + int i; + + for (i = 0; i < rule->field_count; i++) { + struct audit_field *f = &rule->fields[i]; + switch (f->type) { + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + return 1; + } + } + + return 0; +} + +/* This function will re-initialize the se_rule field of all applicable rules. + * It will traverse the filter lists serarching for rules that contain selinux + * specific filter fields. When such a rule is found, it is copied, the + * selinux field is re-initialized, and the old rule is replaced with the + * updated rule. */ +int selinux_audit_rule_update(void) +{ + struct audit_entry *entry, *n, *nentry; + int i, err = 0; + + /* audit_netlink_mutex synchronizes the writers */ + mutex_lock(&audit_netlink_mutex); + + for (i = 0; i < AUDIT_NR_FILTERS; i++) { + list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { + if (!audit_rule_has_selinux(&entry->rule)) + continue; + + nentry = audit_dupe_rule(&entry->rule); + if (unlikely(IS_ERR(nentry))) { + /* save the first error encountered for the + * return value */ + if (!err) + err = PTR_ERR(nentry); + audit_panic("error updating selinux filters"); + list_del_rcu(&entry->list); + } else { + list_replace_rcu(&entry->list, &nentry->list); + } + call_rcu(&entry->rcu, audit_free_rule_rcu); + } + } + + mutex_unlock(&audit_netlink_mutex); + + return err; +} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8aca4ab..d3d97d2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "audit.h" @@ -168,6 +169,9 @@ static int audit_filter_rules(struct task_struct *tsk, enum audit_state *state) { int i, j; + u32 sid; + + selinux_task_ctxid(tsk, &sid); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; @@ -257,6 +261,22 @@ static int audit_filter_rules(struct task_struct *tsk, if (ctx) result = audit_comparator(ctx->loginuid, f->op, f->val); break; + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + /* NOTE: this may return negative values indicating + a temporary error. We simply treat this as a + match for now to avoid losing information that + may be wanted. An error message will also be + logged upon error */ + if (f->se_rule) + result = selinux_audit_rule_match(sid, f->type, + f->op, + f->se_rule, + ctx); + break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: -- cgit v0.10.2 From 1b50eed9cac0e8e5e4d3a522d8aa267f7f8f8acb Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Mon, 3 Apr 2006 14:06:13 -0400 Subject: [PATCH] audit inode patch Previously, we were gathering the context instead of the sid. Now in this patch, we gather just the sid and convert to context only if an audit event is being output. This patch brings the performance hit from 146% down to 23% Signed-off-by: Al Viro diff --git a/include/linux/selinux.h b/include/linux/selinux.h index 9d684b1..84a6c74 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -15,6 +15,7 @@ struct selinux_audit_rule; struct audit_context; +struct inode; #ifdef CONFIG_SECURITY_SELINUX @@ -76,6 +77,27 @@ void selinux_audit_set_callback(int (*callback)(void)); */ void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid); +/** + * selinux_ctxid_to_string - map a security context ID to a string + * @ctxid: security context ID to be converted. + * @ctx: address of context string to be returned + * @ctxlen: length of returned context string. + * + * Returns 0 if successful, -errno if not. On success, the context + * string will be allocated internally, and the caller must call + * kfree() on it after use. + */ +int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen); + +/** + * selinux_get_inode_sid - get the inode's security context ID + * @inode: inode structure to get the sid from. + * @sid: pointer to security context ID to be filled in. + * + * Returns nothing + */ +void selinux_get_inode_sid(const struct inode *inode, u32 *sid); + #else static inline int selinux_audit_rule_init(u32 field, u32 op, @@ -107,6 +129,18 @@ static inline void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid) *ctxid = 0; } +static inline int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen) +{ + *ctx = NULL; + *ctxlen = 0; + return 0; +} + +static inline void selinux_get_inode_sid(const struct inode *inode, u32 *sid) +{ + *sid = 0; +} + #endif /* CONFIG_SECURITY_SELINUX */ #endif /* _LINUX_SELINUX_H */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d3d97d2..2e123a8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -90,7 +90,7 @@ struct audit_names { uid_t uid; gid_t gid; dev_t rdev; - char *ctx; + u32 osid; }; struct audit_aux_data { @@ -410,9 +410,6 @@ static inline void audit_free_names(struct audit_context *context) #endif for (i = 0; i < context->name_count; i++) { - char *p = context->names[i].ctx; - context->names[i].ctx = NULL; - kfree(p); if (context->names[i].name) __putname(context->names[i].name); } @@ -674,6 +671,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } } for (i = 0; i < context->name_count; i++) { + int call_panic = 0; unsigned long ino = context->names[i].ino; unsigned long pino = context->names[i].pino; @@ -703,12 +701,22 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->names[i].gid, MAJOR(context->names[i].rdev), MINOR(context->names[i].rdev)); - if (context->names[i].ctx) { - audit_log_format(ab, " obj=%s", - context->names[i].ctx); + if (context->names[i].osid != 0) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string( + context->names[i].osid, &ctx, &len)) { + audit_log_format(ab, " obj=%u", + context->names[i].osid); + call_panic = 1; + } else + audit_log_format(ab, " obj=%s", ctx); + kfree(ctx); } audit_log_end(ab); + if (call_panic) + audit_panic("error converting sid to string"); } } @@ -946,37 +954,8 @@ void audit_putname(const char *name) void audit_inode_context(int idx, const struct inode *inode) { struct audit_context *context = current->audit_context; - const char *suffix = security_inode_xattr_getsuffix(); - char *ctx = NULL; - int len = 0; - - if (!suffix) - goto ret; - - len = security_inode_getsecurity(inode, suffix, NULL, 0, 0); - if (len == -EOPNOTSUPP) - goto ret; - if (len < 0) - goto error_path; - - ctx = kmalloc(len, GFP_KERNEL); - if (!ctx) - goto error_path; - - len = security_inode_getsecurity(inode, suffix, ctx, len, 0); - if (len < 0) - goto error_path; - - kfree(context->names[idx].ctx); - context->names[idx].ctx = ctx; - goto ret; -error_path: - if (ctx) - kfree(ctx); - audit_panic("error in audit_inode_context"); -ret: - return; + selinux_get_inode_sid(inode, &context->names[idx].osid); } diff --git a/security/selinux/exports.c b/security/selinux/exports.c index 333c4c7..07ddce7 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "security.h" #include "objsec.h" @@ -26,3 +27,26 @@ void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid) else *ctxid = 0; } + +int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen) +{ + if (selinux_enabled) + return security_sid_to_context(ctxid, ctx, ctxlen); + else { + *ctx = NULL; + *ctxlen = 0; + } + + return 0; +} + +void selinux_get_inode_sid(const struct inode *inode, u32 *sid) +{ + if (selinux_enabled) { + struct inode_security_struct *isec = inode->i_security; + *sid = isec->sid; + return; + } + *sid = 0; +} + -- cgit v0.10.2 From 9c7aa6aa74fa8a5cda36e54cbbe4fffe0214497d Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Fri, 31 Mar 2006 15:22:49 -0500 Subject: [PATCH] change lspp ipc auditing Hi, The patch below converts IPC auditing to collect sid's and convert to context string only if it needs to output an audit record. This patch depends on the inode audit change patch already being applied. Signed-off-by: Steve Grubb Signed-off-by: Al Viro diff --git a/include/linux/security.h b/include/linux/security.h index aaa0a5c..1bab48f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -869,11 +869,6 @@ struct swap_info_struct; * @ipcp contains the kernel IPC permission structure * @flag contains the desired (requested) permission set * Return 0 if permission is granted. - * @ipc_getsecurity: - * Copy the security label associated with the ipc object into - * @buffer. @buffer may be NULL to request the size of the buffer - * required. @size indicates the size of @buffer in bytes. Return - * number of bytes used/required on success. * * Security hooks for individual messages held in System V IPC message queues * @msg_msg_alloc_security: @@ -1223,7 +1218,6 @@ struct security_operations { void (*task_to_inode)(struct task_struct *p, struct inode *inode); int (*ipc_permission) (struct kern_ipc_perm * ipcp, short flag); - int (*ipc_getsecurity)(struct kern_ipc_perm *ipcp, void *buffer, size_t size); int (*msg_msg_alloc_security) (struct msg_msg * msg); void (*msg_msg_free_security) (struct msg_msg * msg); @@ -1887,11 +1881,6 @@ static inline int security_ipc_permission (struct kern_ipc_perm *ipcp, return security_ops->ipc_permission (ipcp, flag); } -static inline int security_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size) -{ - return security_ops->ipc_getsecurity(ipcp, buffer, size); -} - static inline int security_msg_msg_alloc (struct msg_msg * msg) { return security_ops->msg_msg_alloc_security (msg); @@ -2532,11 +2521,6 @@ static inline int security_ipc_permission (struct kern_ipc_perm *ipcp, return 0; } -static inline int security_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size) -{ - return -EOPNOTSUPP; -} - static inline int security_msg_msg_alloc (struct msg_msg * msg) { return 0; diff --git a/include/linux/selinux.h b/include/linux/selinux.h index 84a6c74..413d667 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -16,6 +16,7 @@ struct selinux_audit_rule; struct audit_context; struct inode; +struct kern_ipc_perm; #ifdef CONFIG_SECURITY_SELINUX @@ -98,6 +99,15 @@ int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen); */ void selinux_get_inode_sid(const struct inode *inode, u32 *sid); +/** + * selinux_get_ipc_sid - get the ipc security context ID + * @ipcp: ipc structure to get the sid from. + * @sid: pointer to security context ID to be filled in. + * + * Returns nothing + */ +void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid); + #else static inline int selinux_audit_rule_init(u32 field, u32 op, @@ -141,6 +151,11 @@ static inline void selinux_get_inode_sid(const struct inode *inode, u32 *sid) *sid = 0; } +static inline void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid) +{ + *sid = 0; +} + #endif /* CONFIG_SECURITY_SELINUX */ #endif /* _LINUX_SELINUX_H */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2e123a8..b4f7223 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -107,7 +107,7 @@ struct audit_aux_data_ipcctl { uid_t uid; gid_t gid; mode_t mode; - char *ctx; + u32 osid; }; struct audit_aux_data_socketcall { @@ -432,11 +432,6 @@ static inline void audit_free_aux(struct audit_context *context) dput(axi->dentry); mntput(axi->mnt); } - if ( aux->type == AUDIT_IPC ) { - struct audit_aux_data_ipcctl *axi = (void *)aux; - if (axi->ctx) - kfree(axi->ctx); - } context->aux = aux->next; kfree(aux); @@ -584,7 +579,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { - int i; + int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; @@ -635,8 +630,20 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s", - axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx); + " qbytes=%lx iuid=%u igid=%u mode=%x", + axi->qbytes, axi->uid, axi->gid, axi->mode); + if (axi->osid != 0) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string( + axi->osid, &ctx, &len)) { + audit_log_format(ab, " obj=%u", + axi->osid); + call_panic = 1; + } else + audit_log_format(ab, " obj=%s", ctx); + kfree(ctx); + } break; } case AUDIT_SOCKETCALL: { @@ -671,7 +678,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } } for (i = 0; i < context->name_count; i++) { - int call_panic = 0; unsigned long ino = context->names[i].ino; unsigned long pino = context->names[i].pino; @@ -708,16 +714,16 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->names[i].osid, &ctx, &len)) { audit_log_format(ab, " obj=%u", context->names[i].osid); - call_panic = 1; + call_panic = 2; } else audit_log_format(ab, " obj=%s", ctx); kfree(ctx); } audit_log_end(ab); - if (call_panic) - audit_panic("error converting sid to string"); } + if (call_panic) + audit_panic("error converting sid to string"); } /** @@ -951,7 +957,7 @@ void audit_putname(const char *name) #endif } -void audit_inode_context(int idx, const struct inode *inode) +static void audit_inode_context(int idx, const struct inode *inode) { struct audit_context *context = current->audit_context; @@ -1141,38 +1147,6 @@ uid_t audit_get_loginuid(struct audit_context *ctx) return ctx ? ctx->loginuid : -1; } -static char *audit_ipc_context(struct kern_ipc_perm *ipcp) -{ - struct audit_context *context = current->audit_context; - char *ctx = NULL; - int len = 0; - - if (likely(!context)) - return NULL; - - len = security_ipc_getsecurity(ipcp, NULL, 0); - if (len == -EOPNOTSUPP) - goto ret; - if (len < 0) - goto error_path; - - ctx = kmalloc(len, GFP_ATOMIC); - if (!ctx) - goto error_path; - - len = security_ipc_getsecurity(ipcp, ctx, len); - if (len < 0) - goto error_path; - - return ctx; - -error_path: - kfree(ctx); - audit_panic("error in audit_ipc_context"); -ret: - return NULL; -} - /** * audit_ipc_perms - record audit data for ipc * @qbytes: msgq bytes @@ -1198,7 +1172,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, str ax->uid = uid; ax->gid = gid; ax->mode = mode; - ax->ctx = audit_ipc_context(ipcp); + selinux_get_ipc_sid(ipcp, &ax->osid); ax->d.type = AUDIT_IPC; ax->d.next = context->aux; diff --git a/security/dummy.c b/security/dummy.c index fd99429..8cccccc 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -563,11 +563,6 @@ static int dummy_ipc_permission (struct kern_ipc_perm *ipcp, short flag) return 0; } -static int dummy_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size) -{ - return -EOPNOTSUPP; -} - static int dummy_msg_msg_alloc_security (struct msg_msg *msg) { return 0; @@ -976,7 +971,6 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, task_reparent_to_init); set_to_dummy_if_null(ops, task_to_inode); set_to_dummy_if_null(ops, ipc_permission); - set_to_dummy_if_null(ops, ipc_getsecurity); set_to_dummy_if_null(ops, msg_msg_alloc_security); set_to_dummy_if_null(ops, msg_msg_free_security); set_to_dummy_if_null(ops, msg_queue_alloc_security); diff --git a/security/selinux/exports.c b/security/selinux/exports.c index 07ddce7..7357cf2 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "security.h" #include "objsec.h" @@ -50,3 +51,13 @@ void selinux_get_inode_sid(const struct inode *inode, u32 *sid) *sid = 0; } +void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid) +{ + if (selinux_enabled) { + struct ipc_security_struct *isec = ipcp->security; + *sid = isec->sid; + return; + } + *sid = 0; +} + diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index b61b9554..3cf368a 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4052,13 +4052,6 @@ static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag) return ipc_has_perm(ipcp, av); } -static int selinux_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size) -{ - struct ipc_security_struct *isec = ipcp->security; - - return selinux_getsecurity(isec->sid, buffer, size); -} - /* module stacking operations */ static int selinux_register_security (const char *name, struct security_operations *ops) { @@ -4321,7 +4314,6 @@ static struct security_operations selinux_ops = { .task_to_inode = selinux_task_to_inode, .ipc_permission = selinux_ipc_permission, - .ipc_getsecurity = selinux_ipc_getsecurity, .msg_msg_alloc_security = selinux_msg_msg_alloc_security, .msg_msg_free_security = selinux_msg_msg_free_security, -- cgit v0.10.2 From e7c3497013a7e5496ce3d5fd3c73b5cf5af7a56e Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Mon, 3 Apr 2006 09:08:13 -0400 Subject: [PATCH] Reworked patch for labels on user space messages The below patch should be applied after the inode and ipc sid patches. This patch is a reworking of Tim's patch that has been updated to match the inode and ipc patches since its similar. [updated: > Stephen Smalley also wanted to change a variable from isec to tsec in the > user sid patch. ] Signed-off-by: Steve Grubb Signed-off-by: Al Viro diff --git a/include/linux/netlink.h b/include/linux/netlink.h index f8f3d1c..87b8a57 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -143,6 +143,7 @@ struct netlink_skb_parms __u32 dst_group; kernel_cap_t eff_cap; __u32 loginuid; /* Login (audit) uid */ + __u32 sid; /* SELinux security id */ }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) diff --git a/include/linux/selinux.h b/include/linux/selinux.h index 413d667..4047bcd 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -5,6 +5,7 @@ * * Copyright (C) 2005 Red Hat, Inc., James Morris * Copyright (C) 2006 Trusted Computer Solutions, Inc. + * Copyright (C) 2006 IBM Corporation, Timothy R. Chavez * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, @@ -108,6 +109,16 @@ void selinux_get_inode_sid(const struct inode *inode, u32 *sid); */ void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid); +/** + * selinux_get_task_sid - return the SID of task + * @tsk: the task whose SID will be returned + * @sid: pointer to security context ID to be filled in. + * + * Returns nothing + */ +void selinux_get_task_sid(struct task_struct *tsk, u32 *sid); + + #else static inline int selinux_audit_rule_init(u32 field, u32 op, @@ -156,6 +167,11 @@ static inline void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *si *sid = 0; } +static inline void selinux_get_task_sid(struct task_struct *tsk, u32 *sid) +{ + *sid = 0; +} + #endif /* CONFIG_SECURITY_SELINUX */ #endif /* _LINUX_SELINUX_H */ diff --git a/kernel/audit.c b/kernel/audit.c index 9060be7..7ec9cca 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -390,7 +390,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - u32 uid, pid, seq; + u32 uid, pid, seq, sid; void *data; struct audit_status *status_get, status_set; int err; @@ -416,6 +416,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; loginuid = NETLINK_CB(skb).loginuid; + sid = NETLINK_CB(skb).sid; seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); @@ -468,8 +469,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (ab) { audit_log_format(ab, - "user pid=%d uid=%u auid=%u msg='%.1024s'", - pid, uid, loginuid, (char *)data); + "user pid=%d uid=%u auid=%u", + pid, uid, loginuid); + if (sid) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string( + sid, &ctx, &len)) { + audit_log_format(ab, + " subj=%u", sid); + /* Maybe call audit_panic? */ + } else + audit_log_format(ab, + " subj=%s", ctx); + kfree(ctx); + } + audit_log_format(ab, " msg='%.1024s'", + (char *)data); audit_set_pid(ab, pid); audit_log_end(ab); } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2a233ff..09fbc4b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -1157,6 +1158,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, NETLINK_CB(skb).dst_pid = dst_pid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); + selinux_get_task_sid(current, &(NETLINK_CB(skb).sid)); memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); /* What can I do? Netlink is asynchronous, so that diff --git a/security/selinux/exports.c b/security/selinux/exports.c index 7357cf2..ae4c73e 100644 --- a/security/selinux/exports.c +++ b/security/selinux/exports.c @@ -5,6 +5,7 @@ * * Copyright (C) 2005 Red Hat, Inc., James Morris * Copyright (C) 2006 Trusted Computer Solutions, Inc. + * Copyright (C) 2006 IBM Corporation, Timothy R. Chavez * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, @@ -61,3 +62,13 @@ void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid) *sid = 0; } +void selinux_get_task_sid(struct task_struct *tsk, u32 *sid) +{ + if (selinux_enabled) { + struct task_security_struct *tsec = tsk->security; + *sid = tsec->sid; + return; + } + *sid = 0; +} + -- cgit v0.10.2 From ce29b682e228c70cdc91a1b2935c5adb2087bab8 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Sat, 1 Apr 2006 18:29:34 -0500 Subject: [PATCH] More user space subject labels Hi, The patch below builds upon the patch sent earlier and adds subject label to all audit events generated via the netlink interface. It also cleans up a few other minor things. Signed-off-by: Steve Grubb Signed-off-by: Al Viro diff --git a/include/linux/audit.h b/include/linux/audit.h index 740f950..d5c4082 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -371,7 +371,7 @@ extern void audit_log_d_path(struct audit_buffer *ab, extern int audit_filter_user(struct netlink_skb_parms *cb, int type); extern int audit_filter_type(int type); extern int audit_receive_filter(int type, int pid, int uid, int seq, - void *data, size_t datasz, uid_t loginuid); + void *data, size_t datasz, uid_t loginuid, u32 sid); #else #define audit_log(c,g,t,f,...) do { ; } while (0) #define audit_log_start(c,g,t) ({ NULL; }) diff --git a/kernel/audit.c b/kernel/audit.c index 7ec9cca..df57b49 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -230,49 +230,103 @@ void audit_log_lost(const char *message) } } -static int audit_set_rate_limit(int limit, uid_t loginuid) +static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) { - int old = audit_rate_limit; - audit_rate_limit = limit; - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + int old = audit_rate_limit; + + if (sid) { + char *ctx = NULL; + u32 len; + int rc; + if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + return rc; + else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_rate_limit=%d old=%d by auid=%u subj=%s", + limit, old, loginuid, ctx); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_rate_limit=%d old=%d by auid=%u", - audit_rate_limit, old, loginuid); + limit, old, loginuid); + audit_rate_limit = limit; return old; } -static int audit_set_backlog_limit(int limit, uid_t loginuid) +static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) { - int old = audit_backlog_limit; - audit_backlog_limit = limit; - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + int old = audit_backlog_limit; + + if (sid) { + char *ctx = NULL; + u32 len; + int rc; + if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + return rc; + else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_backlog_limit=%d old=%d by auid=%u subj=%s", + limit, old, loginuid, ctx); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_backlog_limit=%d old=%d by auid=%u", - audit_backlog_limit, old, loginuid); + limit, old, loginuid); + audit_backlog_limit = limit; return old; } -static int audit_set_enabled(int state, uid_t loginuid) +static int audit_set_enabled(int state, uid_t loginuid, u32 sid) { - int old = audit_enabled; + int old = audit_enabled; + if (state != 0 && state != 1) return -EINVAL; - audit_enabled = state; - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + + if (sid) { + char *ctx = NULL; + u32 len; + int rc; + if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + return rc; + else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_enabled=%d old=%d by auid=%u subj=%s", + state, old, loginuid, ctx); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_enabled=%d old=%d by auid=%u", - audit_enabled, old, loginuid); + state, old, loginuid); + audit_enabled = state; return old; } -static int audit_set_failure(int state, uid_t loginuid) +static int audit_set_failure(int state, uid_t loginuid, u32 sid) { - int old = audit_failure; + int old = audit_failure; + if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; - audit_failure = state; - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + + if (sid) { + char *ctx = NULL; + u32 len; + int rc; + if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + return rc; + else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_failure=%d old=%d by auid=%u subj=%s", + state, old, loginuid, ctx); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_failure=%d old=%d by auid=%u", - audit_failure, old, loginuid); + state, old, loginuid); + audit_failure = state; return old; } @@ -437,25 +491,43 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled, loginuid); + err = audit_set_enabled(status_get->enabled, + loginuid, sid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure, loginuid); + err = audit_set_failure(status_get->failure, + loginuid, sid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; + if (sid) { + char *ctx = NULL; + u32 len; + int rc; + if ((rc = selinux_ctxid_to_string( + sid, &ctx, &len))) + return rc; + else + audit_log(NULL, GFP_KERNEL, + AUDIT_CONFIG_CHANGE, + "audit_pid=%d old=%d by auid=%u subj=%s", + status_get->pid, old, + loginuid, ctx); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_pid=%d old=%d by auid=%u", + status_get->pid, old, loginuid); audit_pid = status_get->pid; - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit_pid=%d old=%d by auid=%u", - audit_pid, old, loginuid); } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) - audit_set_rate_limit(status_get->rate_limit, loginuid); + audit_set_rate_limit(status_get->rate_limit, + loginuid, sid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) audit_set_backlog_limit(status_get->backlog_limit, - loginuid); + loginuid, sid); break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: @@ -477,7 +549,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (selinux_ctxid_to_string( sid, &ctx, &len)) { audit_log_format(ab, - " subj=%u", sid); + " ssid=%u", sid); /* Maybe call audit_panic? */ } else audit_log_format(ab, @@ -499,7 +571,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_LIST: err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), - loginuid); + loginuid, sid); break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: @@ -509,7 +581,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_LIST_RULES: err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), - loginuid); + loginuid, sid); break; case AUDIT_SIGNAL_INFO: sig_data.uid = audit_sig_uid; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 85a7862..7c13490 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -586,9 +586,10 @@ static int audit_list_rules(void *_dest) * @data: payload data * @datasz: size of payload data * @loginuid: loginuid of sender + * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - size_t datasz, uid_t loginuid) + size_t datasz, uid_t loginuid, u32 sid) { struct task_struct *tsk; int *dest; @@ -631,9 +632,23 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_filter_list[entry->rule.listnr]); - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u add rule to list=%d res=%d\n", - loginuid, entry->rule.listnr, !err); + if (sid) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string(sid, &ctx, &len)) { + /* Maybe call audit_panic? */ + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u ssid=%u add rule to list=%d res=%d", + loginuid, sid, entry->rule.listnr, !err); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u subj=%s add rule to list=%d res=%d", + loginuid, ctx, entry->rule.listnr, !err); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u add rule to list=%d res=%d", + loginuid, entry->rule.listnr, !err); if (err) audit_free_rule(entry); @@ -649,9 +664,24 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_del_rule(entry, &audit_filter_list[entry->rule.listnr]); - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u remove rule from list=%d res=%d\n", - loginuid, entry->rule.listnr, !err); + + if (sid) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string(sid, &ctx, &len)) { + /* Maybe call audit_panic? */ + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u ssid=%u remove rule from list=%d res=%d", + loginuid, sid, entry->rule.listnr, !err); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u subj=%s remove rule from list=%d res=%d", + loginuid, ctx, entry->rule.listnr, !err); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u remove rule from list=%d res=%d", + loginuid, entry->rule.listnr, !err); audit_free_rule(entry); break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b4f7223..d94e040 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -637,7 +637,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts u32 len; if (selinux_ctxid_to_string( axi->osid, &ctx, &len)) { - audit_log_format(ab, " obj=%u", + audit_log_format(ab, " osid=%u", axi->osid); call_panic = 1; } else @@ -712,7 +712,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts u32 len; if (selinux_ctxid_to_string( context->names[i].osid, &ctx, &len)) { - audit_log_format(ab, " obj=%u", + audit_log_format(ab, " osid=%u", context->names[i].osid); call_panic = 2; } else -- cgit v0.10.2 From 073115d6b29c7910feaa08241c6484637f5ca958 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Sun, 2 Apr 2006 17:07:33 -0400 Subject: [PATCH] Rework of IPC auditing 1) The audit_ipc_perms() function has been split into two different functions: - audit_ipc_obj() - audit_ipc_set_perm() There's a key shift here... The audit_ipc_obj() collects the uid, gid, mode, and SElinux context label of the current ipc object. This audit_ipc_obj() hook is now found in several places. Most notably, it is hooked in ipcperms(), which is called in various places around the ipc code permforming a MAC check. Additionally there are several places where *checkid() is used to validate that an operation is being performed on a valid object while not necessarily having a nearby ipcperms() call. In these locations, audit_ipc_obj() is called to ensure that the information is captured by the audit system. The audit_set_new_perm() function is called any time the permissions on the ipc object changes. In this case, the NEW permissions are recorded (and note that an audit_ipc_obj() call exists just a few lines before each instance). 2) Support for an AUDIT_IPC_SET_PERM audit message type. This allows for separate auxiliary audit records for normal operations on an IPC object and permissions changes. Note that the same struct audit_aux_data_ipcctl is used and populated, however there are separate audit_log_format statements based on the type of the message. Finally, the AUDIT_IPC block of code in audit_free_aux() was extended to handle aux messages of this new type. No more mem leaks I hope ;-) Signed-off-by: Al Viro diff --git a/include/linux/audit.h b/include/linux/audit.h index d5c4082..b74c148 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -83,6 +83,7 @@ #define AUDIT_CONFIG_CHANGE 1305 /* Audit system configuration change */ #define AUDIT_SOCKADDR 1306 /* sockaddr copied as syscall arg */ #define AUDIT_CWD 1307 /* Current working directory */ +#define AUDIT_IPC_SET_PERM 1311 /* IPC new permissions record type */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ @@ -319,7 +320,8 @@ extern void auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial); extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); extern uid_t audit_get_loginuid(struct audit_context *ctx); -extern int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp); +extern int audit_ipc_obj(struct kern_ipc_perm *ipcp); +extern int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp); extern int audit_socketcall(int nargs, unsigned long *args); extern int audit_sockaddr(int len, void *addr); extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt); @@ -338,7 +340,8 @@ extern int audit_set_macxattr(const char *name); #define audit_inode_child(d,i,p) do { ; } while (0) #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0) #define audit_get_loginuid(c) ({ -1; }) -#define audit_ipc_perms(q,u,g,m,i) ({ 0; }) +#define audit_ipc_obj(i) ({ 0; }) +#define audit_ipc_set_perm(q,u,g,m,i) ({ 0; }) #define audit_socketcall(n,a) ({ 0; }) #define audit_sockaddr(len, addr) ({ 0; }) #define audit_avc_path(dentry, mnt) ({ 0; }) diff --git a/ipc/msg.c b/ipc/msg.c index 48a7f17..7d1340c 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -13,6 +13,9 @@ * mostly rewritten, threaded and wake-one semantics added * MSGMAX limit removed, sysctl's added * (c) 1999 Manfred Spraul + * + * support for audit of ipc object properties and permission changes + * Dustin Kirkland */ #include @@ -447,6 +450,11 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf) if (msg_checkid(msq,msqid)) goto out_unlock_up; ipcp = &msq->q_perm; + + err = audit_ipc_obj(ipcp); + if (err) + goto out_unlock_up; + err = -EPERM; if (current->euid != ipcp->cuid && current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) @@ -460,7 +468,8 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf) switch (cmd) { case IPC_SET: { - if ((err = audit_ipc_perms(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode, ipcp))) + err = audit_ipc_set_perm(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode, ipcp); + if (err) goto out_unlock_up; err = -EPERM; diff --git a/ipc/sem.c b/ipc/sem.c index 642659c..7919f8e 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -61,6 +61,9 @@ * (c) 2001 Red Hat Inc * Lockless wakeup * (c) 2003 Manfred Spraul + * + * support for audit of ipc object properties and permission changes + * Dustin Kirkland */ #include @@ -820,6 +823,11 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun goto out_unlock; } ipcp = &sma->sem_perm; + + err = audit_ipc_obj(ipcp); + if (err) + goto out_unlock; + if (current->euid != ipcp->cuid && current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) { err=-EPERM; @@ -836,7 +844,8 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun err = 0; break; case IPC_SET: - if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode, ipcp))) + err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode, ipcp); + if (err) goto out_unlock; ipcp->uid = setbuf.uid; ipcp->gid = setbuf.gid; diff --git a/ipc/shm.c b/ipc/shm.c index 1c2faf6..8098968 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -13,6 +13,8 @@ * Shared /dev/zero support, Kanoj Sarcar * Move the mm functionality over to mm/shmem.c, Christoph Rohland * + * support for audit of ipc object properties and permission changes + * Dustin Kirkland */ #include @@ -542,6 +544,10 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) if(err) goto out_unlock; + err = audit_ipc_obj(&(shp->shm_perm)); + if (err) + goto out_unlock; + if (!capable(CAP_IPC_LOCK)) { err = -EPERM; if (current->euid != shp->shm_perm.uid && @@ -594,6 +600,10 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) if(err) goto out_unlock_up; + err = audit_ipc_obj(&(shp->shm_perm)); + if (err) + goto out_unlock_up; + if (current->euid != shp->shm_perm.uid && current->euid != shp->shm_perm.cuid && !capable(CAP_SYS_ADMIN)) { @@ -627,12 +637,15 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) err=-EINVAL; if(shp==NULL) goto out_up; - if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, - setbuf.mode, &(shp->shm_perm)))) - goto out_unlock_up; err = shm_checkid(shp,shmid); if(err) goto out_unlock_up; + err = audit_ipc_obj(&(shp->shm_perm)); + if (err) + goto out_unlock_up; + err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode, &(shp->shm_perm)); + if (err) + goto out_unlock_up; err=-EPERM; if (current->euid != shp->shm_perm.uid && current->euid != shp->shm_perm.cuid && diff --git a/ipc/util.c b/ipc/util.c index b3dcfad..8193299 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -10,6 +10,8 @@ * Manfred Spraul * Oct 2002 - One lock per IPC id. RCU ipc_free for lock-free grow_ary(). * Mingming Cao + * Mar 2006 - support for audit of ipc object properties + * Dustin Kirkland */ #include @@ -27,6 +29,7 @@ #include #include #include +#include #include @@ -464,8 +467,10 @@ void ipc_rcu_putref(void *ptr) int ipcperms (struct kern_ipc_perm *ipcp, short flag) { /* flag will most probably be 0 or S_...UGO from */ - int requested_mode, granted_mode; + int requested_mode, granted_mode, err; + if (unlikely((err = audit_ipc_obj(ipcp)))) + return err; requested_mode = (flag >> 6) | (flag >> 3) | flag; granted_mode = ipcp->mode; if (current->euid == ipcp->cuid || current->euid == ipcp->uid) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d94e040..a300736 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -646,6 +646,25 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } break; } + case AUDIT_IPC_SET_PERM: { + struct audit_aux_data_ipcctl *axi = (void *)aux; + audit_log_format(ab, + " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", + axi->qbytes, axi->uid, axi->gid, axi->mode); + if (axi->osid != 0) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string( + axi->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", + axi->osid); + call_panic = 1; + } else + audit_log_format(ab, " obj=%s", ctx); + kfree(ctx); + } + break; } + case AUDIT_SOCKETCALL: { int i; struct audit_aux_data_socketcall *axs = (void *)aux; @@ -1148,7 +1167,36 @@ uid_t audit_get_loginuid(struct audit_context *ctx) } /** - * audit_ipc_perms - record audit data for ipc + * audit_ipc_obj - record audit data for ipc object + * @ipcp: ipc permissions + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int audit_ipc_obj(struct kern_ipc_perm *ipcp) +{ + struct audit_aux_data_ipcctl *ax; + struct audit_context *context = current->audit_context; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + ax->uid = ipcp->uid; + ax->gid = ipcp->gid; + ax->mode = ipcp->mode; + selinux_get_ipc_sid(ipcp, &ax->osid); + + ax->d.type = AUDIT_IPC; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * audit_ipc_set_perm - record audit data for new ipc permissions * @qbytes: msgq bytes * @uid: msgq user id * @gid: msgq group id @@ -1156,7 +1204,7 @@ uid_t audit_get_loginuid(struct audit_context *ctx) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) +int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; @@ -1174,7 +1222,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, str ax->mode = mode; selinux_get_ipc_sid(ipcp, &ax->osid); - ax->d.type = AUDIT_IPC; + ax->d.type = AUDIT_IPC_SET_PERM; ax->d.next = context->aux; context->aux = (void *)ax; return 0; -- cgit v0.10.2 From 2ad312d2093ae506ae0fa184d8d026b559083087 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Tue, 11 Apr 2006 08:50:56 -0400 Subject: [PATCH] Audit Filter Performance While testing the watch performance, I noticed that selinux_task_ctxid() was creeping into the results more than it should. Investigation showed that the function call was being called whether it was needed or not. The below patch fixes this. Signed-off-by: Steve Grubb Signed-off-by: Al Viro diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a300736..1c03a4e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -168,11 +168,9 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_context *ctx, enum audit_state *state) { - int i, j; + int i, j, need_sid = 1; u32 sid; - selinux_task_ctxid(tsk, &sid); - for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; int result = 0; @@ -271,11 +269,16 @@ static int audit_filter_rules(struct task_struct *tsk, match for now to avoid losing information that may be wanted. An error message will also be logged upon error */ - if (f->se_rule) + if (f->se_rule) { + if (need_sid) { + selinux_task_ctxid(tsk, &sid); + need_sid = 0; + } result = selinux_audit_rule_match(sid, f->type, f->op, f->se_rule, ctx); + } break; case AUDIT_ARG0: case AUDIT_ARG1: -- cgit v0.10.2 From 254abfd33a214617deb916585b306faee834c97f Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 1 May 2006 10:40:23 -0700 Subject: IB/mthca: Fix offset in query_gid method GuidInfo records have 8 byte GUIDs in them, so an index should be multiplied by 8 to get an offset. mthca_query_gid() was incorrectly multiplying by 16. Noticed by Leonid Keller . Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 565a24b..a2eae8a 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -306,7 +306,7 @@ static int mthca_query_gid(struct ib_device *ibdev, u8 port, goto out; } - memcpy(gid->raw + 8, out_mad->data + (index % 8) * 16, 8); + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: kfree(in_mad); -- cgit v0.10.2 From 0568b409c74f7a125d92a09a3f386785700ef688 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 19:50:48 +0200 Subject: [PATCH] splice: fix bugs in pipe_to_file() Found by Oleg Nesterov , fixed by me. - Only allow full pages to go to the page cache. - Check page != buf->page instead of using PIPE_BUF_FLAG_STOLEN. - Remember to clear 'stolen' if add_to_page_cache() fails. And as a cleanup on that: - Make the bottom fall-through logic a little less convoluted. Also make the steal path hold an extra reference to the page, so we don't have to differentiate between stolen and non-stolen at the end. Signed-off-by: Jens Axboe diff --git a/fs/pipe.c b/fs/pipe.c index 5a36927..888f265 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -99,8 +99,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, { struct page *page = buf->page; - buf->flags &= ~PIPE_BUF_FLAG_STOLEN; - /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep @@ -130,7 +128,6 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct page *page = buf->page; if (page_count(page) == 1) { - buf->flags |= PIPE_BUF_FLAG_STOLEN; lock_page(page); return 0; } diff --git a/fs/splice.c b/fs/splice.c index 9df28d3..1633778 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -78,7 +78,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, return 1; } - buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; + buf->flags |= PIPE_BUF_FLAG_LRU; return 0; } @@ -87,7 +87,7 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, { page_cache_release(buf->page); buf->page = NULL; - buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU); + buf->flags &= ~PIPE_BUF_FLAG_LRU; } static void *page_cache_pipe_buf_map(struct file *file, @@ -587,9 +587,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, this_len = PAGE_CACHE_SIZE - offset; /* - * Reuse buf page, if SPLICE_F_MOVE is set. + * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full + * page. */ - if (sd->flags & SPLICE_F_MOVE) { + if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) { /* * If steal succeeds, buf->page is now pruned from the vm * side (LRU and page cache) and we can reuse it. The page @@ -604,6 +605,8 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, goto find_page; } + page_cache_get(page); + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) lru_cache_add(page); } else { @@ -662,7 +665,7 @@ find_page: } else if (ret) goto out; - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { + if (buf->page != page) { char *dst = kmap_atomic(page, KM_USER0); memcpy(dst + offset, src + buf->offset, this_len); @@ -671,22 +674,20 @@ find_page: } ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); - if (ret == AOP_TRUNCATED_PAGE) { + if (!ret) { + /* + * Return the number of bytes written and mark page as + * accessed, we are now done! + */ + ret = this_len; + mark_page_accessed(page); + balance_dirty_pages_ratelimited(mapping); + } else if (ret == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto find_page; - } else if (ret) - goto out; - - /* - * Return the number of bytes written. - */ - ret = this_len; - mark_page_accessed(page); - balance_dirty_pages_ratelimited(mapping); + } out: - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) - page_cache_release(page); - + page_cache_release(page); unlock_page(page); out_nomem: buf->ops->unmap(info, buf); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 0008d4b..3130977 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -5,8 +5,7 @@ #define PIPE_BUFFERS (16) -#define PIPE_BUF_FLAG_STOLEN 0x01 -#define PIPE_BUF_FLAG_LRU 0x02 +#define PIPE_BUF_FLAG_LRU 0x01 struct pipe_buffer { struct page *page; -- cgit v0.10.2 From f84d751994441292593523c7069ed147176f6cab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 19:59:03 +0200 Subject: [PATCH] pipe: introduce ->pin() buffer operation The ->map() function is really expensive on highmem machines right now, since it has to use the slower kmap() instead of kmap_atomic(). Splice rarely needs to access the virtual address of a page, so it's a waste of time doing it. Introduce ->pin() to take over the responsibility of making sure the page data is valid. ->map() is then reduced to just kmap(). That way we can also share a most of the pipe buffer ops between pipe.c and splice.c Signed-off-by: Jens Axboe diff --git a/fs/pipe.c b/fs/pipe.c index 888f265..d9644fd 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -110,14 +110,14 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, page_cache_release(page); } -static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +void *generic_pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { kunmap(buf->page); } @@ -135,19 +135,24 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, return 1; } -static void anon_pipe_buf_get(struct pipe_inode_info *info, - struct pipe_buffer *buf) +void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) { page_cache_get(buf->page); } +int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + return 0; +} + static struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = anon_pipe_buf_map, - .unmap = anon_pipe_buf_unmap, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = generic_pipe_buf_pin, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, - .get = anon_pipe_buf_get, + .get = generic_pipe_buf_get, }; static ssize_t @@ -183,12 +188,14 @@ pipe_readv(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - addr = ops->map(filp, pipe, buf); - if (IS_ERR(addr)) { + error = ops->pin(pipe, buf); + if (error) { if (!ret) - ret = PTR_ERR(addr); + error = ret; break; } + + addr = ops->map(pipe, buf); error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); ops->unmap(pipe, buf); if (unlikely(error)) { @@ -300,11 +307,11 @@ pipe_writev(struct file *filp, const struct iovec *_iov, void *addr; int error; - addr = ops->map(filp, pipe, buf); - if (IS_ERR(addr)) { - error = PTR_ERR(addr); + error = ops->pin(pipe, buf); + if (error) goto out; - } + + addr = ops->map(pipe, buf); error = pipe_iov_copy_from_user(offset + addr, iov, chars); ops->unmap(pipe, buf); diff --git a/fs/splice.c b/fs/splice.c index 1633778..d7538d8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -90,9 +90,8 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, buf->flags &= ~PIPE_BUF_FLAG_LRU; } -static void *page_cache_pipe_buf_map(struct file *file, - struct pipe_inode_info *info, - struct pipe_buffer *buf) +static int page_cache_pipe_buf_pin(struct pipe_inode_info *info, + struct pipe_buffer *buf) { struct page *page = buf->page; int err; @@ -118,49 +117,25 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * Page is ok afterall, fall through to mapping. + * Page is ok afterall, we are done. */ unlock_page(page); } - return kmap(page); + return 0; error: unlock_page(page); - return ERR_PTR(err); -} - -static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, - struct pipe_buffer *buf) -{ - kunmap(buf->page); -} - -static void *user_page_pipe_buf_map(struct file *file, - struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return kmap(buf->page); -} - -static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - kunmap(buf->page); -} - -static void page_cache_pipe_buf_get(struct pipe_inode_info *info, - struct pipe_buffer *buf) -{ - page_cache_get(buf->page); + return err; } static struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, - .map = page_cache_pipe_buf_map, - .unmap = page_cache_pipe_buf_unmap, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = page_cache_pipe_buf_pin, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, - .get = page_cache_pipe_buf_get, + .get = generic_pipe_buf_get, }; static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, @@ -171,11 +146,12 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, static struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = user_page_pipe_buf_map, - .unmap = user_page_pipe_buf_unmap, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = generic_pipe_buf_pin, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, - .get = page_cache_pipe_buf_get, + .get = generic_pipe_buf_get, }; /* @@ -517,26 +493,16 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, { struct file *file = sd->file; loff_t pos = sd->pos; - ssize_t ret; - void *ptr; - int more; + int ret, more; - /* - * Sub-optimal, but we are limited by the pipe ->map. We don't - * need a kmap'ed buffer here, we just want to make sure we - * have the page pinned if the pipe page originates from the - * page cache. - */ - ptr = buf->ops->map(file, info, buf); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; + ret = buf->ops->pin(info, buf); + if (!ret) { + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len, - &pos, more); + ret = file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); + } - buf->ops->unmap(info, buf); return ret; } @@ -569,15 +535,14 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, unsigned int offset, this_len; struct page *page; pgoff_t index; - char *src; int ret; /* * make sure the data in this buffer is uptodate */ - src = buf->ops->map(file, info, buf); - if (IS_ERR(src)) - return PTR_ERR(src); + ret = buf->ops->pin(info, buf); + if (unlikely(ret)) + return ret; index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; @@ -666,11 +631,16 @@ find_page: goto out; if (buf->page != page) { - char *dst = kmap_atomic(page, KM_USER0); + /* + * Careful, ->map() uses KM_USER0! + */ + char *src = buf->ops->map(info, buf); + char *dst = kmap_atomic(page, KM_USER1); memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst, KM_USER1); + buf->ops->unmap(info, buf); } ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); @@ -690,7 +660,6 @@ out: page_cache_release(page); unlock_page(page); out_nomem: - buf->ops->unmap(info, buf); return ret; } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 3130977..b8aae1f 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -14,10 +14,23 @@ struct pipe_buffer { unsigned int flags; }; +/* + * Note on the nesting of these functions: + * + * ->pin() + * ->steal() + * ... + * ->map() + * ... + * ->unmap() + * + * That is, ->map() must be called on a pinned buffer, same goes for ->steal(). + */ struct pipe_buf_operations { int can_merge; - void * (*map)(struct file *, struct pipe_inode_info *, struct pipe_buffer *); + void * (*map)(struct pipe_inode_info *, struct pipe_buffer *); void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *); + int (*pin)(struct pipe_inode_info *, struct pipe_buffer *); void (*release)(struct pipe_inode_info *, struct pipe_buffer *); int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); void (*get)(struct pipe_inode_info *, struct pipe_buffer *); @@ -50,6 +63,12 @@ struct pipe_inode_info * alloc_pipe_info(struct inode * inode); void free_pipe_info(struct inode * inode); void __free_pipe_info(struct pipe_inode_info *); +/* Generic pipe buffer ops functions */ +void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *); +void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *); +void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); +int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); + /* * splice is tied to pipes as a transport (at least for now), so we'll just * add the splice flags here. -- cgit v0.10.2 From 7f9c51f0d9783c78db5c2aa16806d0c256ac667f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 19:59:32 +0200 Subject: [PATCH] Add ->splice_read/splice_write to def_blk_fops It can use the generic handlers. Signed-off-by: Jens Axboe diff --git a/fs/block_dev.c b/fs/block_dev.c index af88c43..f5958f4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1104,6 +1104,8 @@ const struct file_operations def_blk_fops = { .readv = generic_file_readv, .writev = generic_file_write_nolock, .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) -- cgit v0.10.2 From e27dedd84c119e2f7af54fcde3293be5ad812103 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 19:59:54 +0200 Subject: [PATCH] splice: call handle_ra_miss() on failure to lookup page Notify the readahead logic of the missing page. Suggested by Oleg Nesterov. Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index d7538d8..0a69164 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -302,6 +302,12 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, page = find_get_page(mapping, index); if (!page) { /* + * Make sure the read-ahead engine is notified + * about this failure. + */ + handle_ra_miss(mapping, &in->f_ra, index); + + /* * page didn't exist, allocate one. */ page = page_cache_alloc_cold(mapping); -- cgit v0.10.2 From f6762b7ad8edd6abc802542ce845d3bc8adcb92f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 20:02:05 +0200 Subject: [PATCH] pipe: enable atomic copying of pipe data to/from user space The pipe ->map() method uses kmap() to virtually map the pages, which is both slow and has known scalability issues on SMP. This patch enables atomic copying of pipe pages, by pre-faulting data and using kmap_atomic() instead. lmbench bw_pipe and lat_pipe measurements agree this is a Good Thing. Here are results from that on a UP machine with highmem (1.5GiB of RAM), running first a UP kernel, SMP kernel, and SMP kernel patched. Vanilla-UP: Pipe bandwidth: 1622.28 MB/sec Pipe bandwidth: 1610.59 MB/sec Pipe bandwidth: 1608.30 MB/sec Pipe latency: 7.3275 microseconds Pipe latency: 7.2995 microseconds Pipe latency: 7.3097 microseconds Vanilla-SMP: Pipe bandwidth: 1382.19 MB/sec Pipe bandwidth: 1317.27 MB/sec Pipe bandwidth: 1355.61 MB/sec Pipe latency: 9.6402 microseconds Pipe latency: 9.6696 microseconds Pipe latency: 9.6153 microseconds Patched-SMP: Pipe bandwidth: 1578.70 MB/sec Pipe bandwidth: 1579.95 MB/sec Pipe bandwidth: 1578.63 MB/sec Pipe latency: 9.1654 microseconds Pipe latency: 9.2266 microseconds Pipe latency: 9.1527 microseconds Signed-off-by: Jens Axboe diff --git a/fs/pipe.c b/fs/pipe.c index d9644fd..3941a7f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -55,7 +55,8 @@ void pipe_wait(struct pipe_inode_info *pipe) } static int -pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) +pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, + int atomic) { unsigned long copy; @@ -64,8 +65,13 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) iov++; copy = min_t(unsigned long, len, iov->iov_len); - if (copy_from_user(to, iov->iov_base, copy)) - return -EFAULT; + if (atomic) { + if (__copy_from_user_inatomic(to, iov->iov_base, copy)) + return -EFAULT; + } else { + if (copy_from_user(to, iov->iov_base, copy)) + return -EFAULT; + } to += copy; len -= copy; iov->iov_base += copy; @@ -75,7 +81,8 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) } static int -pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) +pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, + int atomic) { unsigned long copy; @@ -84,8 +91,13 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) iov++; copy = min_t(unsigned long, len, iov->iov_len); - if (copy_to_user(iov->iov_base, from, copy)) - return -EFAULT; + if (atomic) { + if (__copy_to_user_inatomic(iov->iov_base, from, copy)) + return -EFAULT; + } else { + if (copy_to_user(iov->iov_base, from, copy)) + return -EFAULT; + } from += copy; len -= copy; iov->iov_base += copy; @@ -94,6 +106,47 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } +/* + * Attempt to pre-fault in the user memory, so we can use atomic copies. + * Returns the number of bytes not faulted in. + */ +static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + if (fault_in_pages_writeable(iov->iov_base, this_len)) + break; + + len -= this_len; + iov++; + } + + return len; +} + +/* + * Pre-fault in the user memory, so we can use atomic copies. + */ +static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + fault_in_pages_readable(iov->iov_base, this_len); + len -= this_len; + iov++; + } +} + static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -111,15 +164,24 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, } void *generic_pipe_buf_map(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) + struct pipe_buffer *buf, int atomic) { + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return kmap_atomic(buf->page, KM_USER0); + } + return kmap(buf->page); } void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) + struct pipe_buffer *buf, void *map_data) { - kunmap(buf->page); + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + kunmap_atomic(map_data, KM_USER0); + } else + kunmap(buf->page); } static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, @@ -183,7 +245,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, struct pipe_buf_operations *ops = buf->ops; void *addr; size_t chars = buf->len; - int error; + int error, atomic; if (chars > total_len) chars = total_len; @@ -195,12 +257,21 @@ pipe_readv(struct file *filp, const struct iovec *_iov, break; } - addr = ops->map(pipe, buf); - error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); - ops->unmap(pipe, buf); + atomic = !iov_fault_in_pages_write(iov, chars); +redo: + addr = ops->map(pipe, buf, atomic); + error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); + ops->unmap(pipe, buf, addr); if (unlikely(error)) { + /* + * Just retry with the slow path if we failed. + */ + if (atomic) { + atomic = 0; + goto redo; + } if (!ret) - ret = -EFAULT; + ret = error; break; } ret += chars; @@ -304,21 +375,28 @@ pipe_writev(struct file *filp, const struct iovec *_iov, int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { + int error, atomic = 1; void *addr; - int error; error = ops->pin(pipe, buf); if (error) goto out; - addr = ops->map(pipe, buf); + iov_fault_in_pages_read(iov, chars); +redo1: + addr = ops->map(pipe, buf, atomic); error = pipe_iov_copy_from_user(offset + addr, iov, - chars); - ops->unmap(pipe, buf); + chars, atomic); + ops->unmap(pipe, buf, addr); ret = error; do_wakeup = 1; - if (error) + if (error) { + if (atomic) { + atomic = 0; + goto redo1; + } goto out; + } buf->len += chars; total_len -= chars; ret = chars; @@ -341,7 +419,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; - int error; + char *src; + int error, atomic = 1; if (!page) { page = alloc_page(GFP_HIGHUSER); @@ -361,11 +440,27 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - error = pipe_iov_copy_from_user(kmap(page), iov, chars); - kunmap(page); + iov_fault_in_pages_read(iov, chars); +redo2: + if (atomic) + src = kmap_atomic(page, KM_USER0); + else + src = kmap(page); + + error = pipe_iov_copy_from_user(src, iov, chars, + atomic); + if (atomic) + kunmap_atomic(src, KM_USER0); + else + kunmap(page); + if (unlikely(error)) { + if (atomic) { + atomic = 0; + goto redo2; + } if (!ret) - ret = -EFAULT; + ret = error; break; } ret += chars; diff --git a/fs/splice.c b/fs/splice.c index 0a69164..d4664a2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -640,13 +640,13 @@ find_page: /* * Careful, ->map() uses KM_USER0! */ - char *src = buf->ops->map(info, buf); + char *src = buf->ops->map(info, buf, 1); char *dst = kmap_atomic(page, KM_USER1); memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst, KM_USER1); - buf->ops->unmap(info, buf); + buf->ops->unmap(info, buf, src); } ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b8aae1f..4c05449 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -5,7 +5,8 @@ #define PIPE_BUFFERS (16) -#define PIPE_BUF_FLAG_LRU 0x01 +#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ +#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ struct pipe_buffer { struct page *page; @@ -28,8 +29,8 @@ struct pipe_buffer { */ struct pipe_buf_operations { int can_merge; - void * (*map)(struct pipe_inode_info *, struct pipe_buffer *); - void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *); + void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); + void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); int (*pin)(struct pipe_inode_info *, struct pipe_buffer *); void (*release)(struct pipe_inode_info *, struct pipe_buffer *); int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); @@ -64,8 +65,8 @@ void free_pipe_info(struct inode * inode); void __free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ -void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *); -void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *); +void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); +void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); -- cgit v0.10.2 From 7afa6fd037e51e95d322990cb127bb2b1217251a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 20:02:33 +0200 Subject: [PATCH] vmsplice: allow user to pass in gift pages If SPLICE_F_GIFT is set, the user is basically giving this pages away to the kernel. That means we can steal them for eg page cache uses instead of copying it. The data must be properly page aligned and also a multiple of the page size in length. Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index d4664a2..b150493 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -141,7 +141,10 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - return 1; + if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) + return 1; + + return 0; } static struct pipe_buf_operations user_page_pipe_buf_ops = { @@ -186,6 +189,9 @@ static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; buf->ops = spd->ops; + if (spd->flags & SPLICE_F_GIFT) + buf->flags |= PIPE_BUF_FLAG_GIFT; + pipe->nrbufs++; page_nr++; ret += buf->len; @@ -1073,7 +1079,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, */ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, - struct partial_page *partial) + struct partial_page *partial, int aligned) { int buffers = 0, error = 0; @@ -1113,6 +1119,15 @@ static int get_iovec_page_array(const struct iovec __user *iov, * in the user pages. */ off = (unsigned long) base & ~PAGE_MASK; + + /* + * If asked for alignment, the offset must be zero and the + * length a multiple of the PAGE_SIZE. + */ + error = -EINVAL; + if (aligned && (off || len & ~PAGE_MASK)) + break; + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; if (npages > PIPE_BUFFERS - buffers) npages = PIPE_BUFFERS - buffers; @@ -1206,7 +1221,8 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, else if (unlikely(!nr_segs)) return 0; - spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial); + spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, + flags & SPLICE_F_GIFT); if (spd.nr_pages <= 0) return spd.nr_pages; @@ -1314,6 +1330,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, obuf = opipe->bufs + nbuf; *obuf = *ibuf; + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + if (obuf->len > len) obuf->len = len; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 4c05449..df4d3fa 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -7,6 +7,7 @@ #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ +#define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ struct pipe_buffer { struct page *page; @@ -79,6 +80,7 @@ int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); /* we may still block on the fd we splice */ /* from/to, of course */ #define SPLICE_F_MORE (0x04) /* expect more data */ +#define SPLICE_F_GIFT (0x08) /* pages passed in are a gift */ /* * Passed to the actors -- cgit v0.10.2 From 755e4ca4a9885b79a14169ab5b615920eb38a32a Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:22:57 -0700 Subject: IB/ipath: fix race with exposing reset file We were accidentally exposing the "reset" sysfs file more than once per device. Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c index 7d3fb69..28ddceb 100644 --- a/drivers/infiniband/hw/ipath/ipath_diag.c +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -277,13 +277,14 @@ static int ipath_diag_open(struct inode *in, struct file *fp) bail: spin_unlock_irqrestore(&ipath_devs_lock, flags); - mutex_unlock(&ipath_mutex); /* Only expose a way to reset the device if we make it into diag mode. */ if (ret == 0) ipath_expose_reset(&dd->pcidev->dev); + mutex_unlock(&ipath_mutex); + return ret; } diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c index 32acd80..f323791 100644 --- a/drivers/infiniband/hw/ipath/ipath_sysfs.c +++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c @@ -711,10 +711,22 @@ static struct attribute_group dev_attr_group = { * enters diag mode. A device reset is quite likely to crash the * machine entirely, so we don't want to normally make it * available. + * + * Called with ipath_mutex held. */ int ipath_expose_reset(struct device *dev) { - return device_create_file(dev, &dev_attr_reset); + static int exposed; + int ret; + + if (!exposed) { + ret = device_create_file(dev, &dev_attr_reset); + exposed = 1; + } + else + ret = 0; + + return ret; } int ipath_driver_create_group(struct device_driver *drv) -- cgit v0.10.2 From 68dd43a162b43218d2e5ac1d139c1d53da965f54 Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:22:58 -0700 Subject: IB/ipath: set up 32-bit DMA mask if 64-bit setup fails Some systems do not set up 64-bit maps on systems with 2GB or less of memory installed, so we have to fall back to trying a 32-bit setup. Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index e7617c3..7ff95b7 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -418,9 +418,19 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ret = pci_set_dma_mask(pdev, DMA_64BIT_MASK); if (ret) { - dev_info(&pdev->dev, "pci_set_dma_mask unit %u " - "fails: %d\n", dd->ipath_unit, ret); - goto bail_regions; + /* + * if the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. + */ + ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + if (ret) { + dev_info(&pdev->dev, "pci_set_dma_mask unit %u " + "fails: %d\n", dd->ipath_unit, ret); + goto bail_regions; + } + else + ipath_dbg("No 64bit DMA mask, used 32 bit mask\n"); } pci_set_master(pdev); -- cgit v0.10.2 From 23e86a4584606a08393e0ad3a5ca27b19a3de374 Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:22:59 -0700 Subject: IB/ipath: iterate over correct number of ports during reset Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 7ff95b7..398add4 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1959,7 +1959,7 @@ int ipath_reset_device(int unit) } if (dd->ipath_pd) - for (i = 1; i < dd->ipath_portcnt; i++) { + for (i = 1; i < dd->ipath_cfgports; i++) { if (dd->ipath_pd[i] && dd->ipath_pd[i]->port_cnt) { ipath_dbg("unit %u port %d is in use " "(PID %u cmd %s), can't reset\n", -- cgit v0.10.2 From 52e7fad825c47fb86801f23b9f793da1d2774f18 Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:23:00 -0700 Subject: IB/ipath: change handling of PIO buffers Different ipath hardware types have different numbers of buffers available, so we decide on the counts ourselves unless we are specifically overridden with a module parameter. Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index 2823ff9..16f640e 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -53,13 +53,19 @@ MODULE_PARM_DESC(cfgports, "Set max number of ports to use"); /* * Number of buffers reserved for driver (layered drivers and SMA - * send). Reserved at end of buffer list. + * send). Reserved at end of buffer list. Initialized based on + * number of PIO buffers if not set via module interface. + * The problem with this is that it's global, but we'll use different + * numbers for different chip types. So the default value is not + * very useful. I've redefined it for the 1.3 release so that it's + * zero unless set by the user to something else, in which case we + * try to respect it. */ -static ushort ipath_kpiobufs = 32; +static ushort ipath_kpiobufs; static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp); -module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_uint, +module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort, &ipath_kpiobufs, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver"); @@ -531,8 +537,11 @@ static int init_housekeeping(struct ipath_devdata *dd, * Don't clear ipath_flags as 8bit mode was set before * entering this func. However, we do set the linkstate to * unknown, so we can watch for a transition. + * PRESENT is set because we want register reads to work, + * and the kernel infrastructure saw it in config space; + * We clear it if we have failures. */ - dd->ipath_flags |= IPATH_LINKUNK; + dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT; dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED | IPATH_LINKDOWN | IPATH_LINKINIT); @@ -560,6 +569,7 @@ static int init_housekeeping(struct ipath_devdata *dd, || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) { ipath_dev_err(dd, "Register read failures from chip, " "giving up initialization\n"); + dd->ipath_flags &= ~IPATH_PRESENT; ret = -ENODEV; goto done; } @@ -682,16 +692,14 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) */ dd->ipath_pioavregs = ALIGN(val, sizeof(u64) * BITS_PER_BYTE / 2) / (sizeof(u64) * BITS_PER_BYTE / 2); - if (!ipath_kpiobufs) /* have to have at least 1, for SMA */ - kpiobufs = ipath_kpiobufs = 1; - else if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) < - (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT)) { - dev_info(&dd->pcidev->dev, "Too few PIO buffers (%u) " - "for %u ports to have %u each!\n", - dd->ipath_piobcnt2k + dd->ipath_piobcnt4k, - dd->ipath_cfgports, IPATH_MIN_USER_PORT_BUFCNT); - kpiobufs = 1; /* reserve just the minimum for SMA/ether */ - } else + if (ipath_kpiobufs == 0) { + /* not set by user, or set explictly to default */ + if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > 128) + kpiobufs = 32; + else + kpiobufs = 16; + } + else kpiobufs = ipath_kpiobufs; if (kpiobufs > -- cgit v0.10.2 From fccea663643cedfa310c5254da30e1e35e09199b Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:23:02 -0700 Subject: IB/ipath: fix verbs registration Remember when the verbs layer unregisters from the lower-level code. Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_layer.c b/drivers/infiniband/hw/ipath/ipath_layer.c index 69ed110..9cb5258 100644 --- a/drivers/infiniband/hw/ipath/ipath_layer.c +++ b/drivers/infiniband/hw/ipath/ipath_layer.c @@ -46,13 +46,15 @@ /* Acquire before ipath_devs_lock. */ static DEFINE_MUTEX(ipath_layer_mutex); +static int ipath_verbs_registered; + u16 ipath_layer_rcv_opcode; + static int (*layer_intr)(void *, u32); static int (*layer_rcv)(void *, void *, struct sk_buff *); static int (*layer_rcv_lid)(void *, void *); static int (*verbs_piobufavail)(void *); static void (*verbs_rcv)(void *, void *, void *, u32); -static int ipath_verbs_registered; static void *(*layer_add_one)(int, struct ipath_devdata *); static void (*layer_remove_one)(void *); @@ -586,6 +588,8 @@ void ipath_verbs_unregister(void) verbs_rcv = NULL; verbs_timer_cb = NULL; + ipath_verbs_registered = 0; + mutex_unlock(&ipath_layer_mutex); } -- cgit v0.10.2 From c71c30dcba142f16bc5f651812b1bc0b9f70f02d Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:23:03 -0700 Subject: IB/ipath: prevent hardware from being accessed during reset The reset code now turns off the PRESENT flag during a reset, so that other code won't attempt to access a device that's in mid-reset. Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 0bcb428..8e3b95b 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -719,11 +719,24 @@ static void handle_rcv(struct ipath_devdata *dd, u32 istat) irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) { struct ipath_devdata *dd = data; - u32 istat = ipath_read_kreg32(dd, dd->ipath_kregs->kr_intstatus); + u32 istat; ipath_err_t estat = 0; static unsigned unexpected = 0; irqreturn_t ret; + if(!(dd->ipath_flags & IPATH_PRESENT)) { + /* this is mostly so we don't try to touch the chip while + * it is being reset */ + /* + * This return value is perhaps odd, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + } + + istat = ipath_read_kreg32(dd, dd->ipath_kregs->kr_intstatus); if (unlikely(!istat)) { ipath_stats.sps_nullintr++; ret = IRQ_NONE; /* not our interrupt, or already handled */ diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index 0ce5f19..e6507f8 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -731,7 +731,7 @@ u64 ipath_read_kreg64_port(const struct ipath_devdata *, ipath_kreg, static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd, ipath_ureg regno, int port) { - if (!dd->ipath_kregbase) + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) return 0; return readl(regno + (u64 __iomem *) @@ -762,7 +762,7 @@ static inline void ipath_write_ureg(const struct ipath_devdata *dd, static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd, ipath_kreg regno) { - if (!dd->ipath_kregbase) + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) return -1; return readl((u32 __iomem *) & dd->ipath_kregbase[regno]); } @@ -770,7 +770,7 @@ static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd, static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd, ipath_kreg regno) { - if (!dd->ipath_kregbase) + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) return -1; return readq(&dd->ipath_kregbase[regno]); @@ -786,7 +786,7 @@ static inline void ipath_write_kreg(const struct ipath_devdata *dd, static inline u64 ipath_read_creg(const struct ipath_devdata *dd, ipath_sreg regno) { - if (!dd->ipath_kregbase) + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) return 0; return readq(regno + (u64 __iomem *) @@ -797,7 +797,7 @@ static inline u64 ipath_read_creg(const struct ipath_devdata *dd, static inline u32 ipath_read_creg32(const struct ipath_devdata *dd, ipath_sreg regno) { - if (!dd->ipath_kregbase) + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) return 0; return readl(regno + (u64 __iomem *) (dd->ipath_cregbase + diff --git a/drivers/infiniband/hw/ipath/ipath_pe800.c b/drivers/infiniband/hw/ipath/ipath_pe800.c index e1dc4f7..6318067 100644 --- a/drivers/infiniband/hw/ipath/ipath_pe800.c +++ b/drivers/infiniband/hw/ipath/ipath_pe800.c @@ -972,6 +972,8 @@ static int ipath_setup_pe_reset(struct ipath_devdata *dd) /* Use ERROR so it shows up in logs, etc. */ ipath_dev_err(dd, "Resetting PE-800 unit %u\n", dd->ipath_unit); + /* keep chip from being accessed in a few places */ + dd->ipath_flags &= ~(IPATH_INITTED|IPATH_PRESENT); val = dd->ipath_control | INFINIPATH_C_RESET; ipath_write_kreg(dd, dd->ipath_kregs->kr_control, val); mb(); @@ -997,6 +999,8 @@ static int ipath_setup_pe_reset(struct ipath_devdata *dd) if ((r = pci_enable_device(dd->pcidev))) ipath_dev_err(dd, "pci_enable_device failed after " "reset: %d\n", r); + /* whether it worked or not, mark as present, again */ + dd->ipath_flags |= IPATH_PRESENT; val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision); if (val == dd->ipath_revision) { ipath_cdbg(VERBOSE, "Got matching revision " -- cgit v0.10.2 From 76f0dd141b477094b026206c0d8c21beac6069f5 Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:23:05 -0700 Subject: IB/ipath: simplify RC send posting Remove some unnecessarily complicated tests. Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index f232e77..eb81424 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -531,19 +531,12 @@ int ipath_post_rc_send(struct ipath_qp *qp, struct ib_send_wr *wr) } wqe->wr.num_sge = j; qp->s_head = next; - /* - * Wake up the send tasklet if the QP is not waiting - * for an RNR timeout. - */ - next = qp->s_rnr_timeout; spin_unlock_irqrestore(&qp->s_lock, flags); - if (next == 0) { - if (qp->ibqp.qp_type == IB_QPT_UC) - ipath_do_uc_send((unsigned long) qp); - else - ipath_do_rc_send((unsigned long) qp); - } + if (qp->ibqp.qp_type == IB_QPT_UC) + ipath_do_uc_send((unsigned long) qp); + else + ipath_do_rc_send((unsigned long) qp); ret = 0; -- cgit v0.10.2 From 9b2017f1e1c95625b2ca2a1ec5317097117d7078 Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:23:06 -0700 Subject: IB/ipath: simplify IB timer usage Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 8d2558a..cb9e387 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -449,7 +449,6 @@ static void ipath_ib_timer(void *arg) { struct ipath_ibdev *dev = (struct ipath_ibdev *) arg; struct ipath_qp *resend = NULL; - struct ipath_qp *rnr = NULL; struct list_head *last; struct ipath_qp *qp; unsigned long flags; @@ -465,32 +464,18 @@ static void ipath_ib_timer(void *arg) last = &dev->pending[dev->pending_index]; while (!list_empty(last)) { qp = list_entry(last->next, struct ipath_qp, timerwait); - if (last->next == LIST_POISON1 || - last->next != &qp->timerwait || - qp->timerwait.prev != last) { - INIT_LIST_HEAD(last); - } else { - list_del(&qp->timerwait); - qp->timerwait.prev = (struct list_head *) resend; - resend = qp; - atomic_inc(&qp->refcount); - } + list_del(&qp->timerwait); + qp->timer_next = resend; + resend = qp; + atomic_inc(&qp->refcount); } last = &dev->rnrwait; if (!list_empty(last)) { qp = list_entry(last->next, struct ipath_qp, timerwait); if (--qp->s_rnr_timeout == 0) { do { - if (last->next == LIST_POISON1 || - last->next != &qp->timerwait || - qp->timerwait.prev != last) { - INIT_LIST_HEAD(last); - break; - } list_del(&qp->timerwait); - qp->timerwait.prev = - (struct list_head *) rnr; - rnr = qp; + tasklet_hi_schedule(&qp->s_task); if (list_empty(last)) break; qp = list_entry(last->next, struct ipath_qp, @@ -530,8 +515,7 @@ static void ipath_ib_timer(void *arg) spin_unlock_irqrestore(&dev->pending_lock, flags); /* XXX What if timer fires again while this is running? */ - for (qp = resend; qp != NULL; - qp = (struct ipath_qp *) qp->timerwait.prev) { + for (qp = resend; qp != NULL; qp = qp->timer_next) { struct ib_wc wc; spin_lock_irqsave(&qp->s_lock, flags); @@ -545,9 +529,6 @@ static void ipath_ib_timer(void *arg) if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); } - for (qp = rnr; qp != NULL; - qp = (struct ipath_qp *) qp->timerwait.prev) - tasklet_hi_schedule(&qp->s_task); } /** @@ -556,9 +537,9 @@ static void ipath_ib_timer(void *arg) * * This is called from ipath_intr() at interrupt level when a PIO buffer is * available after ipath_verbs_send() returned an error that no buffers were - * available. Return 0 if we consumed all the PIO buffers and we still have + * available. Return 1 if we consumed all the PIO buffers and we still have * QPs waiting for buffers (for now, just do a tasklet_hi_schedule and - * return one). + * return zero). */ static int ipath_ib_piobufavail(void *arg) { @@ -579,7 +560,7 @@ static int ipath_ib_piobufavail(void *arg) spin_unlock_irqrestore(&dev->pending_lock, flags); bail: - return 1; + return 0; } static int ipath_query_device(struct ib_device *ibdev, @@ -1159,7 +1140,7 @@ static ssize_t show_stats(struct class_device *cdev, char *buf) len = sprintf(buf, "RC resends %d\n" - "RC QACKs %d\n" + "RC no QACK %d\n" "RC ACKs %d\n" "RC SEQ NAKs %d\n" "RC RDMA seq %d\n" diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index fcafbc7..4f8d593 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -282,7 +282,8 @@ struct ipath_srq { */ struct ipath_qp { struct ib_qp ibqp; - struct ipath_qp *next; /* link list for QPN hash table */ + struct ipath_qp *next; /* link list for QPN hash table */ + struct ipath_qp *timer_next; /* link list for ipath_ib_timer() */ struct list_head piowait; /* link for wait PIO buf */ struct list_head timerwait; /* link for waiting for timeouts */ struct ib_ah_attr remote_ah_attr; -- cgit v0.10.2 From ae15f540cc52acbcbfdf4b902d214a5db5c835d2 Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:23:07 -0700 Subject: IB/ipath: improve sparse annotation Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ips_common.h b/drivers/infiniband/hw/ipath/ips_common.h index 410a764..ab7cbbb 100644 --- a/drivers/infiniband/hw/ipath/ips_common.h +++ b/drivers/infiniband/hw/ipath/ips_common.h @@ -95,7 +95,7 @@ struct ether_header { __u8 seq_num; __le32 len; /* MUST be of word size due to PIO write requirements */ - __u32 csum; + __le32 csum; __le16 csum_offset; __le16 flags; __u16 first_2_bytes; -- cgit v0.10.2 From d562a5ae69bd5643d777788117d02acb22fab347 Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:23:08 -0700 Subject: IB/ipath: fix label name in interrupt handler Names that are the opposite of their intended meanings are not so helpful. Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 8e3b95b..3e72a1f 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -665,14 +665,14 @@ static void handle_layer_pioavail(struct ipath_devdata *dd) ret = __ipath_layer_intr(dd, IPATH_LAYER_INT_SEND_CONTINUE); if (ret > 0) - goto clear; + goto set; ret = __ipath_verbs_piobufavail(dd); if (ret > 0) - goto clear; + goto set; return; -clear: +set: set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); -- cgit v0.10.2 From f6f0413e10b76440fb82efebc63120f3b6d42adb Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Mon, 24 Apr 2006 14:23:09 -0700 Subject: IB/ipath: tidy up white space in a few files Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h index 593e289..4676238 100644 --- a/drivers/infiniband/hw/ipath/ipath_debug.h +++ b/drivers/infiniband/hw/ipath/ipath_debug.h @@ -60,11 +60,11 @@ #define __IPATH_KERNEL_SEND 0x2000 /* use kernel mode send */ #define __IPATH_EPKTDBG 0x4000 /* print ethernet packet data */ #define __IPATH_SMADBG 0x8000 /* sma packet debug */ -#define __IPATH_IPATHDBG 0x10000 /* Ethernet (IPATH) general debug on */ -#define __IPATH_IPATHWARN 0x20000 /* Ethernet (IPATH) warnings on */ -#define __IPATH_IPATHERR 0x40000 /* Ethernet (IPATH) errors on */ -#define __IPATH_IPATHPD 0x80000 /* Ethernet (IPATH) packet dump on */ -#define __IPATH_IPATHTABLE 0x100000 /* Ethernet (IPATH) table dump on */ +#define __IPATH_IPATHDBG 0x10000 /* Ethernet (IPATH) gen debug */ +#define __IPATH_IPATHWARN 0x20000 /* Ethernet (IPATH) warnings */ +#define __IPATH_IPATHERR 0x40000 /* Ethernet (IPATH) errors */ +#define __IPATH_IPATHPD 0x80000 /* Ethernet (IPATH) packet dump */ +#define __IPATH_IPATHTABLE 0x100000 /* Ethernet (IPATH) table dump */ #else /* _IPATH_DEBUGGING */ @@ -79,11 +79,12 @@ #define __IPATH_TRSAMPLE 0x0 /* generate trace buffer sample entries */ #define __IPATH_VERBDBG 0x0 /* very verbose debug */ #define __IPATH_PKTDBG 0x0 /* print packet data */ -#define __IPATH_PROCDBG 0x0 /* print process startup (init)/exit messages */ +#define __IPATH_PROCDBG 0x0 /* process startup (init)/exit messages */ /* print mmap/nopage stuff, not using VDBG any more */ #define __IPATH_MMDBG 0x0 #define __IPATH_EPKTDBG 0x0 /* print ethernet packet data */ -#define __IPATH_SMADBG 0x0 /* print process startup (init)/exit messages */#define __IPATH_IPATHDBG 0x0 /* Ethernet (IPATH) table dump on */ +#define __IPATH_SMADBG 0x0 /* process startup (init)/exit messages */ +#define __IPATH_IPATHDBG 0x0 /* Ethernet (IPATH) table dump on */ #define __IPATH_IPATHWARN 0x0 /* Ethernet (IPATH) warnings on */ #define __IPATH_IPATHERR 0x0 /* Ethernet (IPATH) errors on */ #define __IPATH_IPATHPD 0x0 /* Ethernet (IPATH) packet dump on */ diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h index 1e59750..402126e 100644 --- a/drivers/infiniband/hw/ipath/ipath_registers.h +++ b/drivers/infiniband/hw/ipath/ipath_registers.h @@ -34,8 +34,9 @@ #define _IPATH_REGISTERS_H /* - * This file should only be included by kernel source, and by the diags. - * It defines the registers, and their contents, for the InfiniPath HT-400 chip + * This file should only be included by kernel source, and by the diags. It + * defines the registers, and their contents, for the InfiniPath HT-400 + * chip. */ /* @@ -156,8 +157,10 @@ #define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8 #define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL #define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1 -#define INFINIPATH_IBCC_LINKINITCMD_POLL 2 /* cycle through TS1/TS2 till OK */ -#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3 /* wait for TS1, then go on */ +/* cycle through TS1/TS2 till OK */ +#define INFINIPATH_IBCC_LINKINITCMD_POLL 2 +/* wait for TS1, then go on */ +#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3 #define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16 #define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL #define INFINIPATH_IBCC_LINKCMD_INIT 1 /* move to 0x11 */ @@ -182,7 +185,8 @@ #define INFINIPATH_IBCS_LINKSTATE_SHIFT 4 #define INFINIPATH_IBCS_TXREADY 0x40000000 #define INFINIPATH_IBCS_TXCREDITOK 0x80000000 -/* link training states (shift by INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */ +/* link training states (shift by + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */ #define INFINIPATH_IBCS_LT_STATE_DISABLED 0x00 #define INFINIPATH_IBCS_LT_STATE_LINKUP 0x01 #define INFINIPATH_IBCS_LT_STATE_POLLACTIVE 0x02 @@ -267,10 +271,12 @@ /* kr_serdesconfig0 bits */ #define INFINIPATH_SERDC0_RESET_MASK 0xfULL /* overal reset bits */ #define INFINIPATH_SERDC0_RESET_PLL 0x10000000ULL /* pll reset */ -#define INFINIPATH_SERDC0_TXIDLE 0xF000ULL /* tx idle enables (per lane) */ -#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL /* rx detect enables (per lane) */ -#define INFINIPATH_SERDC0_L1PWR_DN 0xF0ULL /* L1 Power down; use with RXDETECT, - Otherwise not used on IB side */ +/* tx idle enables (per lane) */ +#define INFINIPATH_SERDC0_TXIDLE 0xF000ULL +/* rx detect enables (per lane) */ +#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL +/* L1 Power down; use with RXDETECT, Otherwise not used on IB side */ +#define INFINIPATH_SERDC0_L1PWR_DN 0xF0ULL /* kr_xgxsconfig bits */ #define INFINIPATH_XGXS_RESET 0x7ULL @@ -390,12 +396,13 @@ struct ipath_kregs { ipath_kreg kr_txintmemsize; ipath_kreg kr_xgxsconfig; ipath_kreg kr_ibpllcfg; - /* use these two (and the following N ports) only with ipath_k*_kreg64_port(); - * not *kreg64() */ + /* use these two (and the following N ports) only with + * ipath_k*_kreg64_port(); not *kreg64() */ ipath_kreg kr_rcvhdraddr; ipath_kreg kr_rcvhdrtailaddr; - /* remaining registers are not present on all types of infinipath chips */ + /* remaining registers are not present on all types of infinipath + chips */ ipath_kreg kr_rcvpktledcnt; ipath_kreg kr_pcierbuftestreg0; ipath_kreg kr_pcierbuftestreg1; diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index 01cfb30..e606daf 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -46,8 +46,10 @@ * This is called from ipath_post_ud_send() to forward a WQE addressed * to the same HCA. */ -static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_sge_state *ss, - u32 length, struct ib_send_wr *wr, struct ib_wc *wc) +static void ipath_ud_loopback(struct ipath_qp *sqp, + struct ipath_sge_state *ss, + u32 length, struct ib_send_wr *wr, + struct ib_wc *wc) { struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); struct ipath_qp *qp; -- cgit v0.10.2 From 235acec78e87a60ace01d1ecb4b87ad1d689715a Mon Sep 17 00:00:00 2001 From: Bastian Blank Date: Mon, 1 May 2006 12:15:42 -0700 Subject: [PATCH] s390: make qeth buildable Signed-off-by: Bastian Blank Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: "David S. Miller" Acked-by: Jeff Garzik Acked-by: Frank Pavlic Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c index b3c6e79..cb14642 100644 --- a/drivers/s390/net/qeth_main.c +++ b/drivers/s390/net/qeth_main.c @@ -8014,7 +8014,6 @@ static int (*qeth_old_arp_constructor) (struct neighbour *); static struct neigh_ops arp_direct_ops_template = { .family = AF_INET, - .destructor = NULL, .solicit = NULL, .error_report = NULL, .output = dev_queue_xmit, -- cgit v0.10.2 From df30d0f4ca3c41b60068232d6de9d58be88436f0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 May 2006 12:15:44 -0700 Subject: [PATCH] md: Avoid oops when attempting to fix read errors on raid10 We should add to the counter for the rdev *after* checking if the rdev is NULL!!! Signed-off-by: Neil Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 617012b..ddc1dfc 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1435,9 +1435,9 @@ static void raid10d(mddev_t *mddev) sl--; d = r10_bio->devs[sl].devnum; rdev = conf->mirrors[d].rdev; - atomic_add(s, &rdev->corrected_errors); if (rdev && test_bit(In_sync, &rdev->flags)) { + atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev->bdev, r10_bio->devs[sl].addr + sect + rdev->data_offset, -- cgit v0.10.2 From e0a33270ed0e8e00cbb882a33d21e1f92aac0ceb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 May 2006 12:15:45 -0700 Subject: [PATCH] md: Fixed refcounting/locking when attempting read error correction in raid10 We need to hold a reference to rdevs while reading and writing to attempt to correct read errors. This reference must be taken under an rcu lock. Signed-off-by: Neil Brown Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ddc1dfc..1440935 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1407,36 +1407,45 @@ static void raid10d(mddev_t *mddev) if (s > (PAGE_SIZE>>9)) s = PAGE_SIZE >> 9; + rcu_read_lock(); do { int d = r10_bio->devs[sl].devnum; - rdev = conf->mirrors[d].rdev; + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && - test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) - success = 1; - else { - sl++; - if (sl == conf->copies) - sl = 0; + test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + success = sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); + if (success) + break; } + sl++; + if (sl == conf->copies) + sl = 0; } while (!success && sl != r10_bio->read_slot); + rcu_read_unlock(); if (success) { int start = sl; /* write it back and re-read */ + rcu_read_lock(); while (sl != r10_bio->read_slot) { int d; if (sl==0) sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; - rdev = conf->mirrors[d].rdev; + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); atomic_add(s, &rdev->corrected_errors); if (sync_page_io(rdev->bdev, r10_bio->devs[sl].addr + @@ -1444,6 +1453,8 @@ static void raid10d(mddev_t *mddev) s<<9, conf->tmppage, WRITE) == 0) /* Well, this device is dead */ md_error(mddev, rdev); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); } } sl = start; @@ -1453,17 +1464,22 @@ static void raid10d(mddev_t *mddev) sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; - rdev = conf->mirrors[d].rdev; + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); if (sync_page_io(rdev->bdev, r10_bio->devs[sl].addr + sect + rdev->data_offset, s<<9, conf->tmppage, READ) == 0) /* Well, this device is dead */ md_error(mddev, rdev); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); } } + rcu_read_unlock(); } else { /* Cannot read from anywhere -- bye bye array */ md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev); -- cgit v0.10.2 From bea2771871ed1084c9a85ed0c86340f188505702 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 May 2006 12:15:46 -0700 Subject: [PATCH] md: Change ENOTSUPP to EOPNOTSUPP Because that is what you get if a BIO_RW_BARRIER isn't supported! Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 6081941..532d040 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -315,7 +315,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int if (r1_bio->bios[mirror] == bio) break; - if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { + if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); set_bit(R1BIO_BarrierRetry, &r1_bio->state); r1_bio->mddev->barriers_work = 0; @@ -1404,7 +1404,7 @@ static void raid1d(mddev_t *mddev) unplug = 1; } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { /* some requests in the r1bio were BIO_RW_BARRIER - * requests which failed with -ENOTSUPP. Hohumm.. + * requests which failed with -EOPNOTSUPP. Hohumm.. * Better resubmit without the barrier. * We know which devices to resubmit for, because * all others have had their bios[] entry cleared. -- cgit v0.10.2 From 62de608da0b0ab17d81a233b50d1e952b9816f69 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 May 2006 12:15:47 -0700 Subject: [PATCH] md: Improve detection of lack of barrier support in raid1 Move the test for 'do barrier work' down a bit so that if the first write to a raid1 is a BIO_RW_BARRIER write, the checking done by superblock writes will cause the right thing to happen. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 532d040..b8c13c8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -753,18 +753,24 @@ static int make_request(request_queue_t *q, struct bio * bio) const int rw = bio_data_dir(bio); int do_barriers; - if (unlikely(!mddev->barriers_work && bio_barrier(bio))) { - bio_endio(bio, bio->bi_size, -EOPNOTSUPP); - return 0; - } - /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. + * We test barriers_work *after* md_write_start as md_write_start + * may cause the first superblock write, and that will check out + * if barriers work. */ + md_write_start(mddev, bio); /* wait on superblock update early */ + if (unlikely(!mddev->barriers_work && bio_barrier(bio))) { + if (rw == WRITE) + md_write_end(mddev); + bio_endio(bio, bio->bi_size, -EOPNOTSUPP); + return 0; + } + wait_barrier(conf); disk_stat_inc(mddev->gendisk, ios[rw]); -- cgit v0.10.2 From 5e7dd2ab6b9bdfa60e19b8739e6b2a204fd4f477 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 May 2006 12:15:47 -0700 Subject: [PATCH] md: Fix 'rdev->nr_pending' count when retrying barrier requests When retrying a failed BIO_RW_BARRIER request, we need to keep the reference in ->nr_pending over the whole retry. Currently, we only hold the reference if the failed request is the *last* one to finish - which is silly, because it would normally be the first to finish. So move the rdev_dec_pending call up into the didn't-fail branch. As the rdev isn't used in the later code, calling rdev_dec_pending earlier doesn't hurt. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b8c13c8..4070eff 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -319,6 +319,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); set_bit(R1BIO_BarrierRetry, &r1_bio->state); r1_bio->mddev->barriers_work = 0; + /* Don't rdev_dec_pending in this branch - keep it for the retry */ } else { /* * this branch is our 'one mirror IO has finished' event handler: @@ -365,6 +366,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int } } } + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } /* * @@ -374,11 +376,9 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int if (atomic_dec_and_test(&r1_bio->remaining)) { if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { reschedule_retry(r1_bio); - /* Don't dec_pending yet, we want to hold - * the reference over the retry - */ goto out; } + /* it really is the end of this request */ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { /* free extra copy of the data pages */ int i = bio->bi_vcnt; @@ -393,8 +393,6 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } - - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); out: if (to_put) bio_put(to_put); @@ -1414,6 +1412,7 @@ static void raid1d(mddev_t *mddev) * Better resubmit without the barrier. * We know which devices to resubmit for, because * all others have had their bios[] entry cleared. + * We already have a nr_pending reference on these rdevs. */ int i; clear_bit(R1BIO_BarrierRetry, &r1_bio->state); -- cgit v0.10.2 From d2610202290b4924b71747314a0f88f28807702e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 May 2006 12:15:48 -0700 Subject: [PATCH] x86_64: Add compat_sys_vmsplice and use it in x86-64 Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 57fc37e..5a92fed 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -695,4 +695,5 @@ ia32_sys_call_table: .quad sys_splice .quad sys_sync_file_range .quad sys_tee + .quad compat_sys_vmsplice ia32_syscall_end: diff --git a/fs/compat.c b/fs/compat.c index 2e32bd3..3f3e8f4 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1317,6 +1317,26 @@ out: return ret; } +asmlinkage long +compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, + unsigned int nr_segs, unsigned int flags) +{ + unsigned i; + struct iovec *iov; + if (nr_segs >= UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} + /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the * O_LARGEFILE flag. -- cgit v0.10.2 From 32828546b32a96d550b85eab25609bc4ba3942ab Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 May 2006 12:15:49 -0700 Subject: [PATCH] i386/x86-64: Fix ACPI disabled LAPIC handling mismerge The patch I submitted earlier to fix disabled LAPIC handling in ACPI was mismerged for some reason I still don't quite understand. Parts of it was applied to the wrong function. This patch fixes it up. Cc: Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 049a255..4c785a6 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -215,7 +215,7 @@ static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size) { struct acpi_table_madt *madt = NULL; - if (!phys_addr || !size || !cpu_has_apic) + if (!phys_addr || !size) return -EINVAL; madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size); @@ -1151,6 +1151,9 @@ int __init acpi_boot_init(void) acpi_table_parse(ACPI_BOOT, acpi_parse_sbf); + if (!cpu_has_apic) + return -ENODEV; + /* * set sci_int and PM timer address */ -- cgit v0.10.2 From 5871aa6d5a98f315016d1deee07425c269c37f29 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 May 2006 12:15:50 -0700 Subject: [PATCH] i386: Fix overflow in e820_all_mapped The 32bit version of e820_all_mapped() needs to use u64 to avoid overflows on PAE systems. Pointed out by Jan Beulich Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 80cb3b2..d77e89a 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -970,8 +970,10 @@ efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) * not-overlapping, which is the case */ int __init -e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +e820_all_mapped(unsigned long s, unsigned long e, unsigned type) { + u64 start = s; + u64 end = e; int i; for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; -- cgit v0.10.2 From 42e4c8585f8cbbfac3b70aa2d0a4f869509a0354 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 May 2006 12:15:51 -0700 Subject: [PATCH] i386: Remove apic= warning The apic= option can be used to set the APIC driver too. When that is done this code would always produce bogus warnings. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 254cee9..013b85d 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -757,10 +757,6 @@ static int __init apic_set_verbosity(char *str) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", str) == 0) apic_verbosity = APIC_VERBOSE; - else - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", str); - return 1; } -- cgit v0.10.2 From f3a19cb45f4730c4ce09ca9bccf197efd010dc3b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 1 May 2006 12:15:52 -0700 Subject: [PATCH] silence initcall warnings Suppress the initcall-return-value warnings unless initcall_debug was specified. They do find bugs, but they're extremely small ones and as Andi points out, people get distressed. Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/init/main.c b/init/main.c index 4a2f089..f715b9b 100644 --- a/init/main.c +++ b/init/main.c @@ -582,7 +582,7 @@ static void __init do_initcalls(void) result = (*call)(); - if (result && (result != -ENODEV || initcall_debug)) { + if (result && result != -ENODEV && initcall_debug) { sprintf(msgbuf, "error code %d", result); msg = msgbuf; } -- cgit v0.10.2 From c39e50b4bada27102306d7fa68ea35dc4e61bd68 Mon Sep 17 00:00:00 2001 From: "Victor V. Vengerov" Date: Mon, 1 May 2006 12:15:53 -0700 Subject: [PATCH] uml: fix iomem list traversal We need to walk the region list properly. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 0500800..fc0f0b0 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -407,6 +407,8 @@ unsigned long find_iomem(char *driver, unsigned long *len_out) *len_out = region->size; return(region->virt); } + + region = region->next; } return(0); -- cgit v0.10.2 From 893bb96a29bee4a5cf2486720ac79412aaee5066 Mon Sep 17 00:00:00 2001 From: Joris van Rantwijk Date: Mon, 1 May 2006 12:15:56 -0700 Subject: [PATCH] uml: skas0 support for 2G/2G hosts A quick hack to allow skas0 mode to run on 2G/2G hosts. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 05fbb20..76e85bb 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -57,20 +57,6 @@ config STATIC_LINK chroot, and you disable CONFIG_MODE_TT, you probably want to say Y here. -config HOST_2G_2G - bool "2G/2G host address space split" - default n - depends on MODE_TT - help - This is needed when the host on which you run has a 2G/2G memory - split, instead of the customary 3G/1G. - - Note that to enable such a host - configuration, which makes sense only in some cases, you need special - host patches. - - So, if you do not know what to do here, say 'N'. - config KERNEL_HALF_GIGS int "Kernel address space size (in .5G units)" default "1" diff --git a/arch/um/Kconfig.i386 b/arch/um/Kconfig.i386 index 85e6a55..f6eb72d 100644 --- a/arch/um/Kconfig.i386 +++ b/arch/um/Kconfig.i386 @@ -16,6 +16,19 @@ config SEMAPHORE_SLEEPERS bool default y +config HOST_2G_2G + bool "2G/2G host address space split" + default n + help + This is needed when the host on which you run has a 2G/2G memory + split, instead of the customary 3G/1G. + + Note that to enable such a host + configuration, which makes sense only in some cases, you need special + host patches. + + So, if you do not know what to do here, say 'N'. + config TOP_ADDR hex default 0xc0000000 if !HOST_2G_2G @@ -35,11 +48,13 @@ config 3_LEVEL_PGTABLES config STUB_CODE hex - default 0xbfffe000 + default 0xbfffe000 if !HOST_2G_2G + default 0x7fffe000 if HOST_2G_2G config STUB_DATA hex - default 0xbffff000 + default 0xbffff000 if !HOST_2G_2G + default 0x7ffff000 if HOST_2G_2G config STUB_START hex -- cgit v0.10.2 From 191ef966ac164a1ce3e06290ca52296744a4aee2 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 1 May 2006 12:15:57 -0700 Subject: [PATCH] uml: remove NULL checks and add some CodingStyle Remove redundant NULL checks before [kv]free + small CodingStyle cleanup for arch/ Signed-off-by: Jesper Juhl Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index c39ea3a..2ffda01 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -89,16 +89,18 @@ void sigio_handler(int sig, union uml_pt_regs *regs) struct irq_fd *irq_fd; int n; - if(smp_sigio_handler()) return; - while(1){ + if (smp_sigio_handler()) + return; + + while (1) { n = os_waiting_for_events(active_fds); if (n <= 0) { if(n == -EINTR) continue; else break; } - for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){ - if(irq_fd->current_events != 0){ + for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) { + if (irq_fd->current_events != 0) { irq_fd->current_events = 0; do_IRQ(irq_fd->irq, regs); } @@ -110,19 +112,17 @@ void sigio_handler(int sig, union uml_pt_regs *regs) static void maybe_sigio_broken(int fd, int type) { - if(os_isatty(fd)){ - if((type == IRQ_WRITE) && !pty_output_sigio){ + if (os_isatty(fd)) { + if ((type == IRQ_WRITE) && !pty_output_sigio) { write_sigio_workaround(); add_sigio_fd(fd, 0); - } - else if((type == IRQ_READ) && !pty_close_sigio){ + } else if ((type == IRQ_READ) && !pty_close_sigio) { write_sigio_workaround(); add_sigio_fd(fd, 1); } } } - int activate_fd(int irq, int fd, int type, void *dev_id) { struct pollfd *tmp_pfd; @@ -132,16 +132,18 @@ int activate_fd(int irq, int fd, int type, void *dev_id) pid = os_getpid(); err = os_set_fd_async(fd, pid); - if(err < 0) + if (err < 0) goto out; new_fd = um_kmalloc(sizeof(*new_fd)); err = -ENOMEM; - if(new_fd == NULL) + if (new_fd == NULL) goto out; - if(type == IRQ_READ) events = UM_POLLIN | UM_POLLPRI; - else events = UM_POLLOUT; + if (type == IRQ_READ) + events = UM_POLLIN | UM_POLLPRI; + else + events = UM_POLLOUT; *new_fd = ((struct irq_fd) { .next = NULL, .id = dev_id, .fd = fd, @@ -165,8 +167,8 @@ int activate_fd(int irq, int fd, int type, void *dev_id) * a semaphore. */ flags = irq_lock(); - for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){ - if((irq_fd->fd == fd) && (irq_fd->type == type)){ + for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) { + if ((irq_fd->fd == fd) && (irq_fd->type == type)) { printk("Registering fd %d twice\n", fd); printk("Irqs : %d, %d\n", irq_fd->irq, irq); printk("Ids : 0x%p, 0x%p\n", irq_fd->id, dev_id); @@ -175,13 +177,13 @@ int activate_fd(int irq, int fd, int type, void *dev_id) } /*-------------*/ - if(type == IRQ_WRITE) + if (type == IRQ_WRITE) fd = -1; tmp_pfd = NULL; n = 0; - while(1){ + while (1) { n = os_create_pollfd(fd, events, tmp_pfd, n); if (n == 0) break; @@ -198,10 +200,8 @@ int activate_fd(int irq, int fd, int type, void *dev_id) * then we free the buffer tmp_fds and try again. */ irq_unlock(flags); - if (tmp_pfd != NULL) { - kfree(tmp_pfd); - tmp_pfd = NULL; - } + kfree(tmp_pfd); + tmp_pfd = NULL; tmp_pfd = um_kmalloc(n); if (tmp_pfd == NULL) @@ -249,7 +249,7 @@ static int same_irq_and_dev(struct irq_fd *irq, void *d) { struct irq_and_dev *data = d; - return((irq->irq == data->irq) && (irq->id == data->dev)); + return ((irq->irq == data->irq) && (irq->id == data->dev)); } void free_irq_by_irq_and_dev(unsigned int irq, void *dev) @@ -262,7 +262,7 @@ void free_irq_by_irq_and_dev(unsigned int irq, void *dev) static int same_fd(struct irq_fd *irq, void *fd) { - return(irq->fd == *((int *) fd)); + return (irq->fd == *((int *)fd)); } void free_irq_by_fd(int fd) @@ -276,16 +276,17 @@ static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out) int i = 0; int fdi; - for(irq=active_fds; irq != NULL; irq = irq->next){ - if((irq->fd == fd) && (irq->irq == irqnum)) break; + for (irq = active_fds; irq != NULL; irq = irq->next) { + if ((irq->fd == fd) && (irq->irq == irqnum)) + break; i++; } - if(irq == NULL){ + if (irq == NULL) { printk("find_irq_by_fd doesn't have descriptor %d\n", fd); goto out; } fdi = os_get_pollfd(i); - if((fdi != -1) && (fdi != fd)){ + if ((fdi != -1) && (fdi != fd)) { printk("find_irq_by_fd - mismatch between active_fds and " "pollfds, fd %d vs %d, need %d\n", irq->fd, fdi, fd); @@ -294,7 +295,7 @@ static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out) } *index_out = i; out: - return(irq); + return irq; } void reactivate_fd(int fd, int irqnum) @@ -305,7 +306,7 @@ void reactivate_fd(int fd, int irqnum) flags = irq_lock(); irq = find_irq_by_fd(fd, irqnum, &i); - if(irq == NULL){ + if (irq == NULL) { irq_unlock(flags); return; } @@ -326,7 +327,7 @@ void deactivate_fd(int fd, int irqnum) flags = irq_lock(); irq = find_irq_by_fd(fd, irqnum, &i); - if(irq == NULL) + if (irq == NULL) goto out; os_set_pollfd(i, -1); out: @@ -338,15 +339,15 @@ int deactivate_all_fds(void) struct irq_fd *irq; int err; - for(irq=active_fds;irq != NULL;irq = irq->next){ + for (irq = active_fds; irq != NULL; irq = irq->next) { err = os_clear_fd_async(irq->fd); - if(err) - return(err); + if (err) + return err; } /* If there is a signal already queued, after unblocking ignore it */ os_set_ioignore(); - return(0); + return 0; } void forward_interrupts(int pid) @@ -356,9 +357,9 @@ void forward_interrupts(int pid) int err; flags = irq_lock(); - for(irq=active_fds;irq != NULL;irq = irq->next){ + for (irq = active_fds; irq != NULL; irq = irq->next) { err = os_set_owner(irq->fd, pid); - if(err < 0){ + if (err < 0) { /* XXX Just remove the irq rather than * print out an infinite stream of these */ @@ -379,7 +380,7 @@ void forward_interrupts(int pid) unsigned int do_IRQ(int irq, union uml_pt_regs *regs) { irq_enter(); - __do_IRQ(irq, (struct pt_regs *) regs); + __do_IRQ(irq, (struct pt_regs *)regs); irq_exit(); return 1; } @@ -392,12 +393,12 @@ int um_request_irq(unsigned int irq, int fd, int type, int err; err = request_irq(irq, handler, irqflags, devname, dev_id); - if(err) - return(err); + if (err) + return err; - if(fd != -1) + if (fd != -1) err = activate_fd(irq, fd, type, dev_id); - return(err); + return err; } EXPORT_SYMBOL(um_request_irq); EXPORT_SYMBOL(reactivate_fd); @@ -409,7 +410,7 @@ unsigned long irq_lock(void) unsigned long flags; spin_lock_irqsave(&irq_spinlock, flags); - return(flags); + return flags; } void irq_unlock(unsigned long flags) @@ -452,7 +453,7 @@ void __init init_IRQ(void) irq_desc[TIMER_IRQ].depth = 1; irq_desc[TIMER_IRQ].handler = &SIGVTALRM_irq_type; enable_irq(TIMER_IRQ); - for(i=1;icurrent_events = pollfds[i].revents; pollfds[i].fd = -1; } @@ -54,7 +54,7 @@ int os_waiting_for_events(struct irq_fd *active_fds) int os_isatty(int fd) { - return(isatty(fd)); + return isatty(fd); } int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds) @@ -65,7 +65,7 @@ int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds) return((pollfds_size + 1) * sizeof(pollfds[0])); } - if(pollfds != NULL){ + if (pollfds != NULL) { memcpy(tmp_pfd, pollfds, sizeof(pollfds[0]) * pollfds_size); /* remove old pollfds */ @@ -73,18 +73,15 @@ int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds) } pollfds = tmp_pfd; pollfds_size++; - } else { - /* remove not used tmp_pfd */ - if (tmp_pfd != NULL) - kfree(tmp_pfd); - } + } else + kfree(tmp_pfd); /* remove not used tmp_pfd */ - pollfds[pollfds_num] = ((struct pollfd) { .fd = fd, - .events = events, - .revents = 0 }); + pollfds[pollfds_num] = ((struct pollfd) { .fd = fd, + .events = events, + .revents = 0 }); pollfds_num++; - return(0); + return 0; } void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, @@ -94,11 +91,11 @@ void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, int i = 0; prev = &active_fds; - while(*prev != NULL){ - if((*test)(*prev, arg)){ + while (*prev != NULL) { + if ((*test)(*prev, arg)) { struct irq_fd *old_fd = *prev; - if((pollfds[i].fd != -1) && - (pollfds[i].fd != (*prev)->fd)){ + if ((pollfds[i].fd != -1) && + (pollfds[i].fd != (*prev)->fd)) { printk("os_free_irq_by_cb - mismatch between " "active_fds and pollfds, fd %d vs %d\n", (*prev)->fd, pollfds[i].fd); @@ -110,7 +107,6 @@ void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, /* This moves the *whole* array after pollfds[i] * (though it doesn't spot as such)! */ - memmove(&pollfds[i], &pollfds[i + 1], (pollfds_num - i) * sizeof(pollfds[0])); if(*last_irq_ptr2 == &old_fd->next) @@ -129,10 +125,9 @@ void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg, return; } - int os_get_pollfd(int i) { - return(pollfds[i].fd); + return pollfds[i].fd; } void os_set_pollfd(int i, int fd) @@ -151,8 +146,10 @@ void init_irq_signals(int on_sigstack) int flags; flags = on_sigstack ? SA_ONSTACK : 0; - if(timer_irq_inited) h = (__sighandler_t) alarm_handler; - else h = boot_timer_handler; + if (timer_irq_inited) + h = (__sighandler_t)alarm_handler; + else + h = boot_timer_handler; set_handler(SIGVTALRM, h, flags | SA_RESTART, SIGUSR1, SIGIO, SIGWINCH, SIGALRM, -1); -- cgit v0.10.2 From e3104f50d89b1fffe1fd973e32248a5c7f1bb41e Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 1 May 2006 12:15:58 -0700 Subject: [PATCH] uml: clean up after MADVISE_REMOVE The MADVISE_REMOVE-checking code didn't clean up after itself. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index 3505f44..233be2f 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -206,29 +206,36 @@ int os_drop_memory(void *addr, int length) int can_drop_memory(void) { void *addr; - int fd; + int fd, ok = 0; printk("Checking host MADV_REMOVE support..."); fd = create_mem_file(UM_KERN_PAGE_SIZE); if(fd < 0){ printk("Creating test memory file failed, err = %d\n", -fd); - return 0; + goto out; } addr = mmap64(NULL, UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if(addr == MAP_FAILED){ printk("Mapping test memory file failed, err = %d\n", -errno); - return 0; + goto out_close; } if(madvise(addr, UM_KERN_PAGE_SIZE, MADV_REMOVE) != 0){ printk("MADV_REMOVE failed, err = %d\n", -errno); - return 0; + goto out_unmap; } printk("OK\n"); - return 1; + ok = 1; + +out_unmap: + munmap(addr, UM_KERN_PAGE_SIZE); +out_close: + close(fd); +out: + return ok; } void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int)) -- cgit v0.10.2 From 347b3dc2faf34d63a96b41e3244b743cc72a2aa0 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 1 May 2006 12:15:59 -0700 Subject: [PATCH] uml: update defconfig Bring defconfig up to date. Also disable CONFIG_BLK_DEV_UBD_SYNC by default. By performing synchronous I/O to the host, it slows things down, only protects against host crashes, and can make a UML appear to hang while it waits for the host's disk. Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/defconfig b/arch/um/defconfig index 80d30d1..402a74d 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -1,14 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.12-rc6-mm1 -# Tue Jun 14 18:22:21 2005 +# Linux kernel version: 2.6.17-rc3 +# Fri Apr 28 09:31:20 2006 # CONFIG_GENERIC_HARDIRQS=y CONFIG_UML=y CONFIG_MMU=y -CONFIG_UID16=y -CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_IRQ_RELEASE_METHOD=y # # UML-specific options @@ -16,8 +15,50 @@ CONFIG_GENERIC_CALIBRATE_DELAY=y # CONFIG_MODE_TT is not set # CONFIG_STATIC_LINK is not set CONFIG_MODE_SKAS=y + +# +# Host processor type and features +# +# CONFIG_M386 is not set +# CONFIG_M486 is not set +# CONFIG_M586 is not set +# CONFIG_M586TSC is not set +# CONFIG_M586MMX is not set +CONFIG_M686=y +# CONFIG_MPENTIUMII is not set +# CONFIG_MPENTIUMIII is not set +# CONFIG_MPENTIUMM is not set +# CONFIG_MPENTIUM4 is not set +# CONFIG_MK6 is not set +# CONFIG_MK7 is not set +# CONFIG_MK8 is not set +# CONFIG_MCRUSOE is not set +# CONFIG_MEFFICEON is not set +# CONFIG_MWINCHIPC6 is not set +# CONFIG_MWINCHIP2 is not set +# CONFIG_MWINCHIP3D is not set +# CONFIG_MGEODEGX1 is not set +# CONFIG_MGEODE_LX is not set +# CONFIG_MCYRIXIII is not set +# CONFIG_MVIAC3_2 is not set +# CONFIG_X86_GENERIC is not set +CONFIG_X86_CMPXCHG=y +CONFIG_X86_XADD=y +CONFIG_X86_L1_CACHE_SHIFT=5 +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_X86_PPRO_FENCE=y +CONFIG_X86_WP_WORKS_OK=y +CONFIG_X86_INVLPG=y +CONFIG_X86_BSWAP=y +CONFIG_X86_POPAD_OK=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_GOOD_APIC=y +CONFIG_X86_USE_PPRO_CHECKSUM=y +CONFIG_X86_TSC=y CONFIG_UML_X86=y # CONFIG_64BIT is not set +CONFIG_SEMAPHORE_SLEEPERS=y +# CONFIG_HOST_2G_2G is not set CONFIG_TOP_ADDR=0xc0000000 # CONFIG_3_LEVEL_PGTABLES is not set CONFIG_STUB_CODE=0xbfffe000 @@ -25,22 +66,24 @@ CONFIG_STUB_DATA=0xbffff000 CONFIG_STUB_START=0xbfffe000 CONFIG_ARCH_HAS_SC_SIGNALS=y CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA=y +CONFIG_GENERIC_HWEIGHT=y CONFIG_SELECT_MEMORY_MODEL=y CONFIG_FLATMEM_MANUAL=y # CONFIG_DISCONTIGMEM_MANUAL is not set # CONFIG_SPARSEMEM_MANUAL is not set CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y +# CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_LD_SCRIPT_DYN=y CONFIG_NET=y CONFIG_BINFMT_ELF=y CONFIG_BINFMT_MISC=m # CONFIG_HOSTFS is not set +# CONFIG_HPPFS is not set CONFIG_MCONSOLE=y # CONFIG_MAGIC_SYSRQ is not set -# CONFIG_HOST_2G_2G is not set CONFIG_NEST_LEVEL=0 -CONFIG_KERNEL_HALF_GIGS=1 # CONFIG_HIGHMEM is not set CONFIG_KERNEL_STACK_ORDER=2 CONFIG_UML_REAL_TIME_CLOCK=y @@ -49,7 +92,6 @@ CONFIG_UML_REAL_TIME_CLOCK=y # Code maturity level options # CONFIG_EXPERIMENTAL=y -CONFIG_CLEAN_COMPILE=y CONFIG_BROKEN_ON_SMP=y CONFIG_INIT_ENV_ARG_LIMIT=32 @@ -57,6 +99,7 @@ CONFIG_INIT_ENV_ARG_LIMIT=32 # General setup # CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y @@ -64,26 +107,28 @@ CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_SYSCTL=y # CONFIG_AUDIT is not set -# CONFIG_HOTPLUG is not set -CONFIG_KOBJECT_UEVENT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y +# CONFIG_RELAY is not set +CONFIG_INITRAMFS_SOURCE="" +CONFIG_UID16=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set CONFIG_KALLSYMS_EXTRA_PASS=y +CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y +CONFIG_ELF_CORE=y CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y -CONFIG_CC_ALIGN_FUNCTIONS=0 -CONFIG_CC_ALIGN_LABELS=0 -CONFIG_CC_ALIGN_LOOPS=0 -CONFIG_CC_ALIGN_JUMPS=0 +CONFIG_SLAB=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 +# CONFIG_SLOB is not set # # Loadable module support @@ -91,18 +136,43 @@ CONFIG_BASE_SMALL=0 CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_FORCE_UNLOAD is not set -CONFIG_OBSOLETE_MODPARM=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_KMOD=y # -# Generic Driver Options +# Block layer # -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y -# CONFIG_FW_LOADER is not set -# CONFIG_DEBUG_DRIVER is not set +# CONFIG_LBD is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_LSF is not set + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" + +# +# Block devices +# +CONFIG_BLK_DEV_UBD=y +# CONFIG_BLK_DEV_UBD_SYNC is not set +CONFIG_BLK_DEV_COW_COMMON=y +# CONFIG_MMAPPER is not set +CONFIG_BLK_DEV_LOOP=m +# CONFIG_BLK_DEV_CRYPTOLOOP is not set +CONFIG_BLK_DEV_NBD=m +# CONFIG_BLK_DEV_RAM is not set +# CONFIG_BLK_DEV_INITRD is not set +# CONFIG_ATA_OVER_ETH is not set # # Character Devices @@ -127,50 +197,23 @@ CONFIG_UML_SOUND=m CONFIG_SOUND=m CONFIG_HOSTAUDIO=m CONFIG_UML_RANDOM=y -# CONFIG_MMAPPER is not set - -# -# Block devices -# -CONFIG_BLK_DEV_UBD=y -CONFIG_BLK_DEV_UBD_SYNC=y -CONFIG_BLK_DEV_COW_COMMON=y -CONFIG_BLK_DEV_LOOP=m -# CONFIG_BLK_DEV_CRYPTOLOOP is not set -CONFIG_BLK_DEV_NBD=m -# CONFIG_BLK_DEV_RAM is not set -CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_INITRAMFS_SOURCE="" -# CONFIG_LBD is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y -# CONFIG_ATA_OVER_ETH is not set -CONFIG_NETDEVICES=y # -# UML Network Devices +# Generic Driver Options # -CONFIG_UML_NET=y -CONFIG_UML_NET_ETHERTAP=y -CONFIG_UML_NET_TUNTAP=y -CONFIG_UML_NET_SLIP=y -CONFIG_UML_NET_DAEMON=y -CONFIG_UML_NET_MCAST=y -CONFIG_UML_NET_SLIRP=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y +# CONFIG_FW_LOADER is not set +# CONFIG_DEBUG_DRIVER is not set # -# Networking support +# Networking # # # Networking options # +# CONFIG_NETDEBUG is not set CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y @@ -178,6 +221,7 @@ CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IP_MULTICAST is not set # CONFIG_IP_ADVANCED_ROUTER is not set +CONFIG_IP_FIB_HASH=y # CONFIG_IP_PNP is not set # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set @@ -186,27 +230,31 @@ CONFIG_INET=y # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set +# CONFIG_INET_XFRM_TUNNEL is not set # CONFIG_INET_TUNNEL is not set -CONFIG_IP_TCPDIAG=y -# CONFIG_IP_TCPDIAG_IPV6 is not set - -# -# TCP congestion control -# +CONFIG_INET_DIAG=y +CONFIG_INET_TCP_DIAG=y +# CONFIG_TCP_CONG_ADVANCED is not set CONFIG_TCP_CONG_BIC=y -CONFIG_TCP_CONG_WESTWOOD=y -CONFIG_TCP_CONG_HTCP=y -# CONFIG_TCP_CONG_HSTCP is not set -# CONFIG_TCP_CONG_HYBLA is not set -# CONFIG_TCP_CONG_VEGAS is not set -# CONFIG_TCP_CONG_SCALABLE is not set # CONFIG_IPV6 is not set +# CONFIG_INET6_XFRM_TUNNEL is not set +# CONFIG_INET6_TUNNEL is not set # CONFIG_NETFILTER is not set # +# DCCP Configuration (EXPERIMENTAL) +# +# CONFIG_IP_DCCP is not set + +# # SCTP Configuration (EXPERIMENTAL) # # CONFIG_IP_SCTP is not set + +# +# TIPC Configuration (EXPERIMENTAL) +# +# CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_BRIDGE is not set # CONFIG_VLAN_8021Q is not set @@ -224,27 +272,47 @@ CONFIG_TCP_CONG_HTCP=y # QoS and/or fair queueing # # CONFIG_NET_SCHED is not set -# CONFIG_NET_CLS_ROUTE is not set # # Network testing # # CONFIG_NET_PKTGEN is not set -# CONFIG_KGDBOE is not set -# CONFIG_NETPOLL is not set -# CONFIG_NETPOLL_RX is not set -# CONFIG_NETPOLL_TRAP is not set -# CONFIG_NET_POLL_CONTROLLER is not set # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_IEEE80211 is not set + +# +# UML Network Devices +# +CONFIG_UML_NET=y +CONFIG_UML_NET_ETHERTAP=y +CONFIG_UML_NET_TUNTAP=y +CONFIG_UML_NET_SLIP=y +CONFIG_UML_NET_DAEMON=y +CONFIG_UML_NET_MCAST=y +# CONFIG_UML_NET_PCAP is not set +CONFIG_UML_NET_SLIRP=y + +# +# Network device support +# +CONFIG_NETDEVICES=y CONFIG_DUMMY=m # CONFIG_BONDING is not set # CONFIG_EQUALIZER is not set CONFIG_TUN=m # +# PHY device support +# + +# +# Wireless LAN (non-hamradio) +# +# CONFIG_NET_RADIO is not set + +# # Wan interfaces # # CONFIG_WAN is not set @@ -263,6 +331,13 @@ CONFIG_SLIP=m # CONFIG_SLIP_MODE_SLIP6 is not set # CONFIG_SHAPER is not set # CONFIG_NETCONSOLE is not set +# CONFIG_NETPOLL is not set +# CONFIG_NET_POLL_CONTROLLER is not set + +# +# Connector - unified userspace <-> kernelspace linker +# +# CONFIG_CONNECTOR is not set # # File systems @@ -274,17 +349,14 @@ CONFIG_EXT3_FS=y # CONFIG_EXT3_FS_XATTR is not set CONFIG_JBD=y # CONFIG_JBD_DEBUG is not set -# CONFIG_REISER4_FS is not set CONFIG_REISERFS_FS=y # CONFIG_REISERFS_CHECK is not set # CONFIG_REISERFS_PROC_INFO is not set # CONFIG_REISERFS_FS_XATTR is not set # CONFIG_JFS_FS is not set - -# -# XFS support -# +# CONFIG_FS_POSIX_ACL is not set # CONFIG_XFS_FS is not set +# CONFIG_OCFS2_FS is not set # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set CONFIG_INOTIFY=y @@ -295,11 +367,6 @@ CONFIG_QUOTACTL=y CONFIG_DNOTIFY=y CONFIG_AUTOFS_FS=m CONFIG_AUTOFS4_FS=m - -# -# Caches -# -# CONFIG_FSCACHE is not set # CONFIG_FUSE_FS is not set # @@ -323,14 +390,10 @@ CONFIG_JOLIET=y CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y CONFIG_SYSFS=y -# CONFIG_DEVFS_FS is not set -# CONFIG_DEVPTS_FS_XATTR is not set CONFIG_TMPFS=y -# CONFIG_TMPFS_XATTR is not set # CONFIG_HUGETLB_PAGE is not set CONFIG_RAMFS=y # CONFIG_CONFIGFS_FS is not set -# CONFIG_RELAYFS_FS is not set # # Miscellaneous filesystems @@ -430,6 +493,7 @@ CONFIG_NLS_DEFAULT="iso8859-1" # Library routines # # CONFIG_CRC_CCITT is not set +# CONFIG_CRC16 is not set CONFIG_CRC32=m # CONFIG_LIBCRC32C is not set @@ -448,12 +512,18 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set CONFIG_DEBUG_SLAB=y +# CONFIG_DEBUG_SLAB_LEAK is not set +# CONFIG_DEBUG_MUTEXES is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_KOBJECT is not set CONFIG_DEBUG_INFO=y # CONFIG_DEBUG_FS is not set +# CONFIG_DEBUG_VM is not set CONFIG_FRAME_POINTER=y +# CONFIG_UNWIND_INFO is not set +CONFIG_FORCED_INLINING=y +# CONFIG_RCU_TORTURE_TEST is not set # CONFIG_GPROF is not set # CONFIG_GCOV is not set # CONFIG_SYSCALL_DEBUG is not set -- cgit v0.10.2 From 2ace87b9502d922397cabaf07d73e0b60c480ecf Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 1 May 2006 12:16:00 -0700 Subject: [PATCH] uml: error handling fixes Blairsorblade noticed some confusion between our use of a system call's return value and errno. This patch fixes a number of related bugs - using errno instead of a return value using a return value instead of errno forgetting to negate a error return to get a positive error code Signed-off-by: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index 3bd10de..0925133 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -171,7 +171,7 @@ int os_sigio_async(int master, int slave) flags = fcntl(master, F_GETFL); if(flags < 0) - return errno; + return -errno; if((fcntl(master, F_SETFL, flags | O_NONBLOCK | O_ASYNC) < 0) || (fcntl(master, F_SETOWN, os_getpid()) < 0)) diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 0776bc1..bd89c6b 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -344,12 +344,12 @@ int copy_context_skas0(unsigned long new_stack, int pid) err = ptrace_setregs(pid, regs); if(err < 0) panic("copy_context_skas0 : PTRACE_SETREGS failed, " - "pid = %d, errno = %d\n", pid, errno); + "pid = %d, errno = %d\n", pid, -err); err = ptrace_setfpregs(pid, fp_regs); if(err < 0) panic("copy_context_skas0 : PTRACE_SETFPREGS failed, " - "pid = %d, errno = %d\n", pid, errno); + "pid = %d, errno = %d\n", pid, -err); /* set a well known return code for detection of child write failure */ child_data->err = 12345678; @@ -362,7 +362,7 @@ int copy_context_skas0(unsigned long new_stack, int pid) pid = data->err; if(pid < 0) panic("copy_context_skas0 - stub-parent reports error %d\n", - pid); + -pid); /* Wait, until child has finished too: read child's result from * child's stack and check it. diff --git a/arch/um/os-Linux/sys-i386/registers.c b/arch/um/os-Linux/sys-i386/registers.c index 7a6f6b9..516f66d 100644 --- a/arch/um/os-Linux/sys-i386/registers.c +++ b/arch/um/os-Linux/sys-i386/registers.c @@ -104,7 +104,7 @@ void init_registers(int pid) err = ptrace(PTRACE_GETREGS, pid, 0, exec_regs); if(err) panic("check_ptrace : PTRACE_GETREGS failed, errno = %d", - err); + errno); errno = 0; err = ptrace(PTRACE_GETFPXREGS, pid, 0, exec_fpx_regs); @@ -119,7 +119,7 @@ void init_registers(int pid) err = ptrace(PTRACE_GETFPREGS, pid, 0, exec_fp_regs); if(err) panic("check_ptrace : PTRACE_GETFPREGS failed, errno = %d", - err); + errno); } void get_safe_registers(unsigned long *regs, unsigned long *fp_regs) diff --git a/arch/um/os-Linux/sys-x86_64/registers.c b/arch/um/os-Linux/sys-x86_64/registers.c index 001941f..becd898 100644 --- a/arch/um/os-Linux/sys-x86_64/registers.c +++ b/arch/um/os-Linux/sys-x86_64/registers.c @@ -62,12 +62,12 @@ void init_registers(int pid) err = ptrace(PTRACE_GETREGS, pid, 0, exec_regs); if(err) panic("check_ptrace : PTRACE_GETREGS failed, errno = %d", - err); + errno); err = ptrace(PTRACE_GETFPREGS, pid, 0, exec_fp_regs); if(err) panic("check_ptrace : PTRACE_GETFPREGS failed, errno = %d", - err); + errno); } void get_safe_registers(unsigned long *regs, unsigned long *fp_regs) diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index 34bfc1b..362db05 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -178,14 +178,14 @@ static void __init create_pid_file(void) fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644); if(fd < 0){ printk("Open of machine pid file \"%s\" failed: %s\n", - file, strerror(-fd)); + file, strerror(errno)); return; } snprintf(pid, sizeof(pid), "%d\n", getpid()); n = write(fd, pid, strlen(pid)); if(n != strlen(pid)) - printk("Write of pid file failed - err = %d\n", -n); + printk("Write of pid file failed - err = %d\n", errno); close(fd); } -- cgit v0.10.2 From b15fb6b157b83ce983e42130ab7d1a866020d8f5 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Mon, 1 May 2006 12:16:01 -0700 Subject: [PATCH] uml: fix patch mismerge I sent a patch, it was applied as cda402b283c34a24b091f78eee116963e9494762, then it was applied again as 181ae4005d0a4010802be534d929b38c42b9ac06 by mistake. But while the 1st time it modified (correctly) cow_header_v3, the 2nd it modified cow_header_v3_broken. Signed-off-by: Paolo 'Blaisorblade' Giarrusso Acked-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/drivers/cow_user.c b/arch/um/drivers/cow_user.c index 6ab852b..0ec4052 100644 --- a/arch/um/drivers/cow_user.c +++ b/arch/um/drivers/cow_user.c @@ -100,7 +100,7 @@ struct cow_header_v3_broken { __u32 alignment; __u32 cow_format; char backing_file[PATH_LEN_V3]; -} __attribute__((packed)); +}; /* COW format definitions - for now, we have only the usual COW bitmap */ #define COW_BITMAP 0 -- cgit v0.10.2 From cb98cdcd0dd892dcaec7dabd57c3b00890006b61 Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Mon, 1 May 2006 12:16:01 -0700 Subject: [PATCH] uml: search from uml_net in a more reasonable PATH Append /usr/lib/uml to the existing PATH environment variable to let execvp() search uml_net in FHS compliant locations. Signed-off-by: Mattia Dongili Signed-off-by: Paolo 'Blaisorblade' Giarrusso Acked-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index 2878e89..3a0ac38 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -74,6 +74,34 @@ static void last_ditch_exit(int sig) exit(1); } +#define UML_LIB_PATH ":/usr/lib/uml" + +static void setup_env_path(void) +{ + char *new_path = NULL; + char *old_path = NULL; + int path_len = 0; + + old_path = getenv("PATH"); + /* if no PATH variable is set or it has an empty value + * just use the default + /usr/lib/uml + */ + if (!old_path || (path_len = strlen(old_path)) == 0) { + putenv("PATH=:/bin:/usr/bin/" UML_LIB_PATH); + return; + } + + /* append /usr/lib/uml to the existing path */ + path_len += strlen("PATH=" UML_LIB_PATH) + 1; + new_path = malloc(path_len); + if (!new_path) { + perror("coudn't malloc to set a new PATH"); + return; + } + snprintf(new_path, path_len, "PATH=%s" UML_LIB_PATH, old_path); + putenv(new_path); +} + extern int uml_exitcode; extern void scan_elf_aux( char **envp); @@ -114,6 +142,8 @@ int main(int argc, char **argv, char **envp) set_stklim(); + setup_env_path(); + new_argv = malloc((argc + 1) * sizeof(char *)); if(new_argv == NULL){ perror("Mallocing argv"); -- cgit v0.10.2 From cb8aa3d29b562e4c16fdfa4ecc8170ddc55397f4 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Mon, 1 May 2006 12:16:03 -0700 Subject: [PATCH] uml: use Kbuild tracking for all files and fix compilation output Move the build of user-offsets to arch/um/sys-$(SUBARCH), where it's located. So we can also build it via Kbuild with its dependency tracking rather than by hand. While hacking here, fix also a lot of little cosmetic things. Signed-off-by: Paolo 'Blaisorblade' Giarrusso Acked-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/Makefile b/arch/um/Makefile index a508e7a..930e006 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -96,7 +96,8 @@ PHONY += linux all: linux linux: vmlinux - ln -f $< $@ + @echo ' SYMLINK $@' + $(Q)ln -f $< $@ define archhelp echo '* linux - Binary kernel image (./linux) - for backward' @@ -203,8 +204,8 @@ endef $(ARCH_DIR)/include/uml-config.h : include/linux/autoconf.h $(call filechk,umlconfig) -$(ARCH_DIR)/user-offsets.s: $(ARCH_DIR)/sys-$(SUBARCH)/user-offsets.c - $(CC) $(USER_CFLAGS) -S -o $@ $< +$(ARCH_DIR)/sys-$(SUBARCH)/user-offsets.s: FORCE + $(Q)$(MAKE) $(build)=$(ARCH_DIR)/sys-$(SUBARCH) $@ define filechk_gen-asm-offsets (set -e; \ @@ -219,13 +220,11 @@ define filechk_gen-asm-offsets echo ""; ) endef -$(ARCH_DIR)/include/user_constants.h: $(ARCH_DIR)/user-offsets.s +$(ARCH_DIR)/include/user_constants.h: $(ARCH_DIR)/sys-$(SUBARCH)/user-offsets.s $(call filechk,gen-asm-offsets) -CLEAN_FILES += $(ARCH_DIR)/user-offsets.s - $(ARCH_DIR)/include/kern_constants.h: $(objtree)/$(ARCH_DIR)/include @echo ' SYMLINK $@' - $(Q) ln -sf ../../../include/asm-um/asm-offsets.h $@ + $(Q)ln -sf ../../../include/asm-um/asm-offsets.h $@ export SUBARCH USER_CFLAGS OS diff --git a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile index 98b20b7..82121ab 100644 --- a/arch/um/sys-i386/Makefile +++ b/arch/um/sys-i386/Makefile @@ -10,9 +10,12 @@ subarch-obj-$(CONFIG_MODULES) += kernel/module.o USER_OBJS := bugs.o ptrace_user.o sigcontext.o fault.o stub_segv.o -include arch/um/scripts/Makefile.rules +USER_OBJS += user-offsets.s +extra-y += user-offsets.s extra-$(CONFIG_MODE_TT) += unmap.o +include arch/um/scripts/Makefile.rules + $(obj)/stub_segv.o $(obj)/unmap.o: \ _c_flags = $(call unprofile,$(CFLAGS)) diff --git a/arch/um/sys-x86_64/Makefile b/arch/um/sys-x86_64/Makefile index b5fc22b..f739bea 100644 --- a/arch/um/sys-x86_64/Makefile +++ b/arch/um/sys-x86_64/Makefile @@ -18,9 +18,12 @@ ldt-y = ../sys-i386/ldt.o USER_OBJS := ptrace_user.o sigcontext.o stub_segv.o -include arch/um/scripts/Makefile.rules +USER_OBJS += user-offsets.s +extra-y += user-offsets.s extra-$(CONFIG_MODE_TT) += unmap.o +include arch/um/scripts/Makefile.rules + $(obj)/stub_segv.o $(obj)/unmap.o: \ _c_flags = $(call unprofile,$(CFLAGS)) -- cgit v0.10.2 From 275e6e1ee2bafde77e9390b27e876fa83f24cb60 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Mon, 1 May 2006 12:16:04 -0700 Subject: [PATCH] uml: fix compilation and execution with hardened GCC To make some half-assembly stubs compile, disable various "hardened" GCC features: *) we can't make it build PIC code as we need %ebx to do syscalls and GCC wants it free for PIC *) we can't leave stack protection as the stub is moved (not relocated!) in memory so the RIP-relative access to the canary tries reading from an unmapped address and causes a segfault, since we move the stub of various megabytes (the exact amount will be decided at runtime) away from the link-time address. Signed-off-by: Paolo 'Blaisorblade' Giarrusso Acked-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/Makefile b/arch/um/Makefile index 930e006..bed604a 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -118,6 +118,10 @@ prepare: $(ARCH_DIR)/include/kern_constants.h LINK-$(CONFIG_LD_SCRIPT_STATIC) += -static LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib +CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,) \ + $(call cc-option, -fno-stack-protector,) \ + $(call cc-option, -fno-stack-protector-all,) + CPP_MODE-$(CONFIG_MODE_TT) := -DMODE_TT CONFIG_KERNEL_STACK_ORDER ?= 2 STACK_SIZE := $(shell echo $$[ 4096 * (1 << $(CONFIG_KERNEL_STACK_ORDER)) ] ) @@ -227,4 +231,4 @@ $(ARCH_DIR)/include/kern_constants.h: $(objtree)/$(ARCH_DIR)/include @echo ' SYMLINK $@' $(Q)ln -sf ../../../include/asm-um/asm-offsets.h $@ -export SUBARCH USER_CFLAGS OS +export SUBARCH USER_CFLAGS CFLAGS_NO_HARDENING OS diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index 57181a9..ad84296 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -11,4 +11,11 @@ USER_OBJS := clone.o include arch/um/scripts/Makefile.rules # clone.o is in the stub, so it can't be built with profiling -$(obj)/clone.o : c_flags = -Wp,-MD,$(depfile) $(call unprofile,$(USER_CFLAGS)) +# GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> +# disable it + +CFLAGS_clone.o := $(CFLAGS_NO_HARDENING) + +# since we're setting c_flags we _must_ add $(CFLAGS_$(*F).o). + +$(obj)/clone.o : c_flags = -Wp,-MD,$(depfile) $(call unprofile,$(USER_CFLAGS)) $(CFLAGS_$(*F).o) diff --git a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile index 82121ab..3734c3e 100644 --- a/arch/um/sys-i386/Makefile +++ b/arch/um/sys-i386/Makefile @@ -13,6 +13,8 @@ USER_OBJS := bugs.o ptrace_user.o sigcontext.o fault.o stub_segv.o USER_OBJS += user-offsets.s extra-y += user-offsets.s +CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) + extra-$(CONFIG_MODE_TT) += unmap.o include arch/um/scripts/Makefile.rules diff --git a/arch/um/sys-x86_64/Makefile b/arch/um/sys-x86_64/Makefile index f739bea..6d3b29c 100644 --- a/arch/um/sys-x86_64/Makefile +++ b/arch/um/sys-x86_64/Makefile @@ -21,6 +21,8 @@ USER_OBJS := ptrace_user.o sigcontext.o stub_segv.o USER_OBJS += user-offsets.s extra-y += user-offsets.s +CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) + extra-$(CONFIG_MODE_TT) += unmap.o include arch/um/scripts/Makefile.rules -- cgit v0.10.2 From 7b12b9137930eb821b68e1bfa11e9de692208620 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Mon, 1 May 2006 12:16:05 -0700 Subject: [PATCH] uml: cleanup unprofile expression and build infrastructure *) Rather than duplicate in various buggy ways the application of CFLAGS_NO_HARDENING and UNPROFILE (which apply to the same files), centralize it in Makefile.rules. UNPROFILE_OBJS mustn't be listed in USER_OBJS but are compiled as such. I've also verified that unprofile didn't work in the current form, because we set _c_flags directly (using CFLAGS and not USER_CFLAGS, which is wrong), which is normally used by c_flags, but we also override c_flags for all USER_OBJS, and there we don't call unprofile. Instead it only worked for unmap.o, the only one which wasn't a USER_OBJ. We need to set c_flags (which is not a public Kbuild API) to clear a lot of compilation flags like -nostdinc which Kbuild forces on everything. *) Rather than $(CFLAGS_$(notdir $@)), which expands to CFLAGS_anObj.s when building "anObj.s", use $(CFLAGS_$(*F).o) which always accesses CFLAGS_anObj.o, like done by Kbuild. *) Make c_flags apply to all targets having the same basename, rather than listing .s, .i, .lst and .o, with the use (which I tested) of $(USER_OBJS:.o=.%): c_flags = ... and of - $(obj)/unmap.c: _c_flags = ... + $(obj)/unmap.%: _c_flags = ... Signed-off-by: Paolo 'Blaisorblade' Giarrusso Acked-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index ad84296..ea3a8e4 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -6,16 +6,11 @@ obj-y := clone.o exec_kern.o mem.o mmu.o process_kern.o \ syscall.o tlb.o uaccess.o -USER_OBJS := clone.o - -include arch/um/scripts/Makefile.rules - # clone.o is in the stub, so it can't be built with profiling # GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> # disable it CFLAGS_clone.o := $(CFLAGS_NO_HARDENING) +UNPROFILE_OBJS := clone.o -# since we're setting c_flags we _must_ add $(CFLAGS_$(*F).o). - -$(obj)/clone.o : c_flags = -Wp,-MD,$(depfile) $(call unprofile,$(USER_CFLAGS)) $(CFLAGS_$(*F).o) +include arch/um/scripts/Makefile.rules diff --git a/arch/um/scripts/Makefile.rules b/arch/um/scripts/Makefile.rules index 5e7a9c3..1347dc6 100644 --- a/arch/um/scripts/Makefile.rules +++ b/arch/um/scripts/Makefile.rules @@ -7,11 +7,19 @@ USER_SINGLE_OBJS := \ USER_OBJS += $(filter %_user.o,$(obj-y) $(obj-m) $(USER_SINGLE_OBJS)) USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) -$(USER_OBJS) $(USER_OBJS:.o=.i) $(USER_OBJS:.o=.s) $(USER_OBJS:.o=.lst): \ - c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) $(CFLAGS_$(notdir $@)) +$(USER_OBJS:.o=.%): \ + c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) $(CFLAGS_$(*F).o) $(USER_OBJS) : CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ \ -Dunix -D__unix__ -D__$(SUBARCH)__ +# These are like USER_OBJS but filter USER_CFLAGS through unprofile instead of +# using it directly. +UNPROFILE_OBJS := $(foreach file,$(UNPROFILE_OBJS),$(obj)/$(file)) + +$(UNPROFILE_OBJS:.o=.%): \ + c_flags = -Wp,-MD,$(depfile) $(call unprofile,$(USER_CFLAGS)) $(CFLAGS_$(*F).o) +$(UNPROFILE_OBJS) : CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ \ + -Dunix -D__unix__ -D__$(SUBARCH)__ # The stubs and unmap.o can't try to call mcount or update basic block data define unprofile diff --git a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile index 3734c3e..374d61a 100644 --- a/arch/um/sys-i386/Makefile +++ b/arch/um/sys-i386/Makefile @@ -8,16 +8,16 @@ subarch-obj-y = lib/bitops.o kernel/semaphore.o subarch-obj-$(CONFIG_HIGHMEM) += mm/highmem.o subarch-obj-$(CONFIG_MODULES) += kernel/module.o -USER_OBJS := bugs.o ptrace_user.o sigcontext.o fault.o stub_segv.o +USER_OBJS := bugs.o ptrace_user.o sigcontext.o fault.o USER_OBJS += user-offsets.s extra-y += user-offsets.s -CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) - extra-$(CONFIG_MODE_TT) += unmap.o +UNPROFILE_OBJS := stub_segv.o +CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) + include arch/um/scripts/Makefile.rules -$(obj)/stub_segv.o $(obj)/unmap.o: \ - _c_flags = $(call unprofile,$(CFLAGS)) +$(obj)/unmap.%: _c_flags = $(call unprofile,$(CFLAGS)) diff --git a/arch/um/sys-x86_64/Makefile b/arch/um/sys-x86_64/Makefile index 6d3b29c..c19794d 100644 --- a/arch/um/sys-x86_64/Makefile +++ b/arch/um/sys-x86_64/Makefile @@ -16,16 +16,16 @@ subarch-obj-$(CONFIG_MODULES) += kernel/module.o ldt-y = ../sys-i386/ldt.o -USER_OBJS := ptrace_user.o sigcontext.o stub_segv.o +USER_OBJS := ptrace_user.o sigcontext.o USER_OBJS += user-offsets.s extra-y += user-offsets.s -CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) - extra-$(CONFIG_MODE_TT) += unmap.o +UNPROFILE_OBJS := stub_segv.o +CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) + include arch/um/scripts/Makefile.rules -$(obj)/stub_segv.o $(obj)/unmap.o: \ - _c_flags = $(call unprofile,$(CFLAGS)) +$(obj)/unmap.%: _c_flags = $(call unprofile,$(CFLAGS)) -- cgit v0.10.2 From cead61a6717a9873426b08d73a34a325e3546f5d Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Mon, 1 May 2006 12:16:06 -0700 Subject: [PATCH] uml: export symbols added by GCC hardened GCC hardened introduces additional symbol refererences (for the canary and friends), also in modules - add weak export_symbols for them. We already tested that the weak declaration creates no problem on both GCC's providing the function definition and on GCC's which don't provide it. Signed-off-by: Paolo 'Blaisorblade' Giarrusso Acked-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/os-Linux/user_syms.c b/arch/um/os-Linux/user_syms.c index 2598158..3f33165 100644 --- a/arch/um/os-Linux/user_syms.c +++ b/arch/um/os-Linux/user_syms.c @@ -96,6 +96,13 @@ EXPORT_SYMBOL_PROTO(getuid); EXPORT_SYMBOL_PROTO(fsync); EXPORT_SYMBOL_PROTO(fdatasync); +/* Export symbols used by GCC for the stack protector. */ +extern void __stack_smash_handler(void *) __attribute__((weak)); +EXPORT_SYMBOL(__stack_smash_handler); + +extern long __guard __attribute__((weak)); +EXPORT_SYMBOL(__guard); + /* * Overrides for Emacs so that we follow Linus's tabbing style. * Emacs will notice this stuff at the end of the file and automatically -- cgit v0.10.2 From a4b741e380cb56045ccea36f2dbc10e42af21f24 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 1 May 2006 12:16:06 -0700 Subject: [PATCH] uml: uml-makefile-nicer uses SYMLINK incorrectly Blaisorblade's uml-makefile-nicer makes a V=0 build say SYMLINK where what's happening is really a LINK. Signed-off-by: Jeff Dike Acked-by: Paolo 'Blaisorblade' Giarrusso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/Makefile b/arch/um/Makefile index bed604a..f6ad832 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -96,7 +96,7 @@ PHONY += linux all: linux linux: vmlinux - @echo ' SYMLINK $@' + @echo ' LINK $@' $(Q)ln -f $< $@ define archhelp -- cgit v0.10.2 From 4c28f81193b6778f7b49090930d88e6d12bcb928 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 1 May 2006 12:16:08 -0700 Subject: [PATCH] page migration: Fix fallback behavior for dirty pages Currently we check PageDirty() in order to make the decision to swap out the page. However, the dirty information may be only be contained in the ptes pointing to the page. We need to first unmap the ptes before checking for PageDirty(). If unmap is successful then the page count of the page will also be decreased so that pageout() works properly. This is a fix necessary for 2.6.17. Without this fix we may migrate dirty pages for filesystems without migration functions. Filesystems may keep pointers to dirty pages. Migration of dirty pages can result in the filesystem keeping pointers to freed pages. Unmapping is currently not be separated out from removing all the references to a page and moving the mapping. Therefore try_to_unmap will be called again in migrate_page() if the writeout is successful. However, it wont do anything since the ptes are already removed. The coming updates to the page migration code will restructure the code so that this is no longer necessary. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index d444229..1c25040 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -439,6 +439,17 @@ redo: goto unlock_both; } + /* Make sure the dirty bit is up to date */ + if (try_to_unmap(page, 1) == SWAP_FAIL) { + rc = -EPERM; + goto unlock_both; + } + + if (page_mapcount(page)) { + rc = -EAGAIN; + goto unlock_both; + } + /* * Default handling if a filesystem does not provide * a migration function. We can only migrate clean -- cgit v0.10.2 From 2c43630fb0ff3f01c29367248ffa4a851e2c9b34 Mon Sep 17 00:00:00 2001 From: Pat Gefre Date: Mon, 1 May 2006 12:16:08 -0700 Subject: [PATCH] Altix: correct ioc3 port order Currently loading the ioc3 as a module will cause the ports to be numbered in reverse order. This mod maintains the proper order of cards for port numbering. Signed-off-by: Patrick Gefre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/sn/ioc3.c b/drivers/sn/ioc3.c index 0b49ff7..501316b 100644 --- a/drivers/sn/ioc3.c +++ b/drivers/sn/ioc3.c @@ -678,7 +678,7 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) /* Track PCI-device specific data */ pci_set_drvdata(pdev, idd); down_write(&ioc3_devices_rwsem); - list_add(&idd->list, &ioc3_devices); + list_add_tail(&idd->list, &ioc3_devices); idd->id = ioc3_counter++; up_write(&ioc3_devices_rwsem); -- cgit v0.10.2 From 46a66eecdf7bc12562ecb492297447ed0e1ecf59 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 1 May 2006 12:16:09 -0700 Subject: [PATCH] sparsemem interaction with memory add bug fixes This patch fixes two bugs with the way sparsemem interacts with memory add. They are: - memory leak if memmap for section already exists - calling alloc_bootmem_node() after boot These bugs were discovered and a first cut at the fixes were provided by Arnd Bergmann and Joel Schopp . Signed-off-by: Mike Kravetz Signed-off-by: Joel Schopp Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/sparse.c b/mm/sparse.c index 0a51f36..d7c32de 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -32,7 +32,10 @@ static struct mem_section *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - section = alloc_bootmem_node(NODE_DATA(nid), array_size); + if (system_state == SYSTEM_RUNNING) + section = kmalloc_node(array_size, GFP_KERNEL, nid); + else + section = alloc_bootmem_node(NODE_DATA(nid), array_size); if (section) memset(section, 0, array_size); @@ -281,9 +284,9 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, ret = sparse_init_one_section(ms, section_nr, memmap); - if (ret <= 0) - __kfree_section_memmap(memmap, nr_pages); out: pgdat_resize_unlock(pgdat, &flags); + if (ret <= 0) + __kfree_section_memmap(memmap, nr_pages); return ret; } -- cgit v0.10.2 From bed120c64eb07b6838bb758109811484af8cebba Mon Sep 17 00:00:00 2001 From: Joel H Schopp Date: Mon, 1 May 2006 12:16:11 -0700 Subject: [PATCH] spufs: fix for CONFIG_NUMA Based on an older patch from Mike Kravetz We need to have a mem_map for high addresses in order to make fops->no_page work on spufs mem and register files. So far, we have used the memory_present() function during early bootup, but that did not work when CONFIG_NUMA was enabled. We now use the __add_pages() function to add the mem_map when loading the spufs module, which is a lot nicer. Signed-off-by: Arnd Bergmann Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig index c2a3db8..6a02d51 100644 --- a/arch/powerpc/platforms/cell/Kconfig +++ b/arch/powerpc/platforms/cell/Kconfig @@ -12,7 +12,8 @@ config SPU_FS config SPUFS_MMAP bool - depends on SPU_FS && SPARSEMEM && !PPC_64K_PAGES + depends on SPU_FS && SPARSEMEM + select MEMORY_HOTPLUG default y endmenu diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index dac5d03..6574b22 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include @@ -46,6 +48,7 @@ #include #include #include +#include #include "interrupt.h" #include "iommu.h" @@ -69,77 +72,6 @@ static void cell_show_cpuinfo(struct seq_file *m) of_node_put(root); } -#ifdef CONFIG_SPARSEMEM -static int __init find_spu_node_id(struct device_node *spe) -{ - unsigned int *id; -#ifdef CONFIG_NUMA - struct device_node *cpu; - cpu = spe->parent->parent; - id = (unsigned int *)get_property(cpu, "node-id", NULL); -#else - id = NULL; -#endif - return id ? *id : 0; -} - -static void __init cell_spuprop_present(struct device_node *spe, - const char *prop, int early) -{ - struct address_prop { - unsigned long address; - unsigned int len; - } __attribute__((packed)) *p; - int proplen; - - unsigned long start_pfn, end_pfn, pfn; - int node_id; - - p = (void*)get_property(spe, prop, &proplen); - WARN_ON(proplen != sizeof (*p)); - - node_id = find_spu_node_id(spe); - - start_pfn = p->address >> PAGE_SHIFT; - end_pfn = (p->address + p->len + PAGE_SIZE - 1) >> PAGE_SHIFT; - - /* We need to call memory_present *before* the call to sparse_init, - but we can initialize the page structs only *after* that call. - Thus, we're being called twice. */ - if (early) - memory_present(node_id, start_pfn, end_pfn); - else { - /* As the pages backing SPU LS and I/O are outside the range - of regular memory, their page structs were not initialized - by free_area_init. Do it here instead. */ - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - struct page *page = pfn_to_page(pfn); - set_page_links(page, ZONE_DMA, node_id, pfn); - init_page_count(page); - reset_page_mapcount(page); - SetPageReserved(page); - INIT_LIST_HEAD(&page->lru); - } - } -} - -static void __init cell_spumem_init(int early) -{ - struct device_node *node; - for (node = of_find_node_by_type(NULL, "spe"); - node; node = of_find_node_by_type(node, "spe")) { - cell_spuprop_present(node, "local-store", early); - cell_spuprop_present(node, "problem", early); - cell_spuprop_present(node, "priv1", early); - cell_spuprop_present(node, "priv2", early); - } -} -#else -static void __init cell_spumem_init(int early) -{ -} -#endif - static void cell_progress(char *s, unsigned short hex) { printk("*** %04x : %s\n", hex, s ? s : ""); @@ -172,8 +104,6 @@ static void __init cell_setup_arch(void) #endif mmio_nvram_init(); - - cell_spumem_init(0); } /* @@ -189,8 +119,6 @@ static void __init cell_init_early(void) ppc64_interrupt_controller = IC_CELL_PIC; - cell_spumem_init(1); - DBG(" <- cell_init_early()\n"); } diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index ef47a62..b788e16 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -520,6 +520,56 @@ void spu_irq_setaffinity(struct spu *spu, int cpu) } EXPORT_SYMBOL_GPL(spu_irq_setaffinity); +static int __init find_spu_node_id(struct device_node *spe) +{ + unsigned int *id; + struct device_node *cpu; + cpu = spe->parent->parent; + id = (unsigned int *)get_property(cpu, "node-id", NULL); + return id ? *id : 0; +} + +static int __init cell_spuprop_present(struct device_node *spe, + const char *prop) +{ + static DEFINE_MUTEX(add_spumem_mutex); + + struct address_prop { + unsigned long address; + unsigned int len; + } __attribute__((packed)) *p; + int proplen; + + unsigned long start_pfn, nr_pages; + int node_id; + struct pglist_data *pgdata; + struct zone *zone; + int ret; + + p = (void*)get_property(spe, prop, &proplen); + WARN_ON(proplen != sizeof (*p)); + + start_pfn = p->address >> PAGE_SHIFT; + nr_pages = ((unsigned long)p->len + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* + * XXX need to get the correct NUMA node in here. This may + * be different from the spe::node_id property, e.g. when + * the host firmware is not NUMA aware. + */ + node_id = 0; + + pgdata = NODE_DATA(node_id); + zone = pgdata->node_zones; + + /* XXX rethink locking here */ + mutex_lock(&add_spumem_mutex); + ret = __add_pages(zone, start_pfn, nr_pages); + mutex_unlock(&add_spumem_mutex); + + return ret; +} + static void __iomem * __init map_spe_prop(struct device_node *n, const char *name) { @@ -530,6 +580,8 @@ static void __iomem * __init map_spe_prop(struct device_node *n, void *p; int proplen; + void* ret = NULL; + int err = 0; p = get_property(n, name, &proplen); if (proplen != sizeof (struct address_prop)) @@ -537,7 +589,14 @@ static void __iomem * __init map_spe_prop(struct device_node *n, prop = p; - return ioremap(prop->address, prop->len); + err = cell_spuprop_present(n, name); + if (err && (err != -EEXIST)) + goto out; + + ret = ioremap(prop->address, prop->len); + + out: + return ret; } static void spu_unmap(struct spu *spu) @@ -597,17 +656,6 @@ out: return ret; } -static int __init find_spu_node_id(struct device_node *spe) -{ - unsigned int *id; - struct device_node *cpu; - - cpu = spe->parent->parent; - id = (unsigned int *)get_property(cpu, "node-id", NULL); - - return id ? *id : 0; -} - static int __init create_spu(struct device_node *spe) { struct spu *spu; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1fe76d9..1ae2b2c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -69,12 +69,16 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { err = __add_section(zone, phys_start_pfn + i); - if (err) + /* We want to keep adding the rest of the + * sections if the first ones already exist + */ + if (err && (err != -EEXIST)) break; } return err; } +EXPORT_SYMBOL_GPL(__add_pages); static void grow_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) -- cgit v0.10.2 From 953039c8df7beb2694814e20e2707a77d335a2e3 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 1 May 2006 12:16:12 -0700 Subject: [PATCH] powerpc: Allow devices to register with numa topology Change of_node_to_nid() to traverse the device tree, looking for a numa id. Cell uses this to assign ids to SPUs, which are children of the CPU node. Existing users of of_node_to_nid() are altered to use of_node_to_nid_single(), which doesn't do the traversal. Export an attach_sysdev_to_node() function, allowing system devices (eg. SPUs) to link themselves into the numa topology in sysfs. Signed-off-by: Arnd Bergmann Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index ed737ca..5bc2585 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -322,13 +322,31 @@ static void register_nodes(void) } } } + +int sysfs_add_device_to_node(struct sys_device *dev, int nid) +{ + struct node *node = &node_devices[nid]; + return sysfs_create_link(&node->sysdev.kobj, &dev->kobj, + kobject_name(&dev->kobj)); +} + +void sysfs_remove_device_from_node(struct sys_device *dev, int nid) +{ + struct node *node = &node_devices[nid]; + sysfs_remove_link(&node->sysdev.kobj, kobject_name(&dev->kobj)); +} + #else static void register_nodes(void) { return; } + #endif +EXPORT_SYMBOL_GPL(sysfs_add_device_to_node); +EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node); + /* Only valid if CPU is present. */ static ssize_t show_physical_id(struct sys_device *dev, char *buf) { diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 0a335f3..092355f 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -194,7 +194,7 @@ static int *of_get_associativity(struct device_node *dev) /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa * info is found. */ -static int of_node_to_nid(struct device_node *device) +static int of_node_to_nid_single(struct device_node *device) { int nid = -1; unsigned int *tmp; @@ -216,6 +216,28 @@ out: return nid; } +/* Walk the device tree upwards, looking for an associativity id */ +int of_node_to_nid(struct device_node *device) +{ + struct device_node *tmp; + int nid = -1; + + of_node_get(device); + while (device) { + nid = of_node_to_nid_single(device); + if (nid != -1) + break; + + tmp = device; + device = of_get_parent(tmp); + of_node_put(tmp); + } + of_node_put(device); + + return nid; +} +EXPORT_SYMBOL_GPL(of_node_to_nid); + /* * In theory, the "ibm,associativity" property may contain multiple * associativity lists because a resource may be multiply connected @@ -300,7 +322,7 @@ static int __cpuinit numa_setup_cpu(unsigned long lcpu) goto out; } - nid = of_node_to_nid(cpu); + nid = of_node_to_nid_single(cpu); if (nid < 0 || !node_online(nid)) nid = any_online_node(NODE_MASK_ALL); @@ -393,7 +415,7 @@ static int __init parse_numa_properties(void) cpu = find_cpu_node(i); BUG_ON(!cpu); - nid = of_node_to_nid(cpu); + nid = of_node_to_nid_single(cpu); of_node_put(cpu); /* @@ -437,7 +459,7 @@ new_range: * have associativity properties. If none, then * everything goes to default_nid. */ - nid = of_node_to_nid(memory); + nid = of_node_to_nid_single(memory); if (nid < 0) nid = default_nid; node_set_online(nid); @@ -776,7 +798,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr) ha_new_range: start = read_n_cells(n_mem_addr_cells, &memcell_buf); size = read_n_cells(n_mem_size_cells, &memcell_buf); - nid = of_node_to_nid(memory); + nid = of_node_to_nid_single(memory); /* Domains not present at boot default to 0 */ if (nid < 0 || !node_online(nid)) diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h index 1e19cd0..87362a0 100644 --- a/include/asm-powerpc/topology.h +++ b/include/asm-powerpc/topology.h @@ -4,6 +4,9 @@ #include +struct sys_device; +struct device_node; + #ifdef CONFIG_NUMA #include @@ -27,6 +30,8 @@ static inline int node_to_first_cpu(int node) return first_cpu(tmp); } +int of_node_to_nid(struct device_node *device); + #define pcibus_to_node(node) (-1) #define pcibus_to_cpumask(bus) (cpu_online_map) @@ -57,10 +62,29 @@ static inline int node_to_first_cpu(int node) extern void __init dump_numa_cpu_topology(void); +extern int sysfs_add_device_to_node(struct sys_device *dev, int nid); +extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid); + #else +static inline int of_node_to_nid(struct device_node *device) +{ + return 0; +} + static inline void dump_numa_cpu_topology(void) {} +static inline int sysfs_add_device_to_node(struct sys_device *dev, int nid) +{ + return 0; +} + +static inline void sysfs_remove_device_from_node(struct sys_device *dev, + int nid) +{ +} + + #include #endif /* CONFIG_NUMA */ -- cgit v0.10.2 From 8261aa600943046a5305564a1cd7432009971799 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 1 May 2006 12:16:13 -0700 Subject: [PATCH] powerpc: cell: Add numa id to struct spu Add an nid member to the spu structure, and store the numa id of the spu there on creation. Signed-off-by: Arnd Bergmann Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index b788e16..ad141fe 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -529,8 +529,8 @@ static int __init find_spu_node_id(struct device_node *spe) return id ? *id : 0; } -static int __init cell_spuprop_present(struct device_node *spe, - const char *prop) +static int __init cell_spuprop_present(struct spu *spu, struct device_node *spe, + const char *prop) { static DEFINE_MUTEX(add_spumem_mutex); @@ -541,7 +541,6 @@ static int __init cell_spuprop_present(struct device_node *spe, int proplen; unsigned long start_pfn, nr_pages; - int node_id; struct pglist_data *pgdata; struct zone *zone; int ret; @@ -552,14 +551,7 @@ static int __init cell_spuprop_present(struct device_node *spe, start_pfn = p->address >> PAGE_SHIFT; nr_pages = ((unsigned long)p->len + PAGE_SIZE - 1) >> PAGE_SHIFT; - /* - * XXX need to get the correct NUMA node in here. This may - * be different from the spe::node_id property, e.g. when - * the host firmware is not NUMA aware. - */ - node_id = 0; - - pgdata = NODE_DATA(node_id); + pgdata = NODE_DATA(spu->nid); zone = pgdata->node_zones; /* XXX rethink locking here */ @@ -570,8 +562,8 @@ static int __init cell_spuprop_present(struct device_node *spe, return ret; } -static void __iomem * __init map_spe_prop(struct device_node *n, - const char *name) +static void __iomem * __init map_spe_prop(struct spu *spu, + struct device_node *n, const char *name) { struct address_prop { unsigned long address; @@ -589,7 +581,7 @@ static void __iomem * __init map_spe_prop(struct device_node *n, prop = p; - err = cell_spuprop_present(n, name); + err = cell_spuprop_present(spu, n, name); if (err && (err != -EEXIST)) goto out; @@ -607,44 +599,45 @@ static void spu_unmap(struct spu *spu) iounmap((u8 __iomem *)spu->local_store); } -static int __init spu_map_device(struct spu *spu, struct device_node *spe) +static int __init spu_map_device(struct spu *spu, struct device_node *node) { char *prop; int ret; ret = -ENODEV; - prop = get_property(spe, "isrc", NULL); + prop = get_property(node, "isrc", NULL); if (!prop) goto out; spu->isrc = *(unsigned int *)prop; - spu->name = get_property(spe, "name", NULL); + spu->name = get_property(node, "name", NULL); if (!spu->name) goto out; - prop = get_property(spe, "local-store", NULL); + prop = get_property(node, "local-store", NULL); if (!prop) goto out; spu->local_store_phys = *(unsigned long *)prop; /* we use local store as ram, not io memory */ - spu->local_store = (void __force *)map_spe_prop(spe, "local-store"); + spu->local_store = (void __force *) + map_spe_prop(spu, node, "local-store"); if (!spu->local_store) goto out; - prop = get_property(spe, "problem", NULL); + prop = get_property(node, "problem", NULL); if (!prop) goto out_unmap; spu->problem_phys = *(unsigned long *)prop; - spu->problem= map_spe_prop(spe, "problem"); + spu->problem= map_spe_prop(spu, node, "problem"); if (!spu->problem) goto out_unmap; - spu->priv1= map_spe_prop(spe, "priv1"); + spu->priv1= map_spe_prop(spu, node, "priv1"); /* priv1 is not available on a hypervisor */ - spu->priv2= map_spe_prop(spe, "priv2"); + spu->priv2= map_spe_prop(spu, node, "priv2"); if (!spu->priv2) goto out_unmap; ret = 0; @@ -672,6 +665,10 @@ static int __init create_spu(struct device_node *spe) goto out_free; spu->node = find_spu_node_id(spe); + spu->nid = of_node_to_nid(spe); + if (spu->nid == -1) + spu->nid = 0; + spu->stop_code = 0; spu->slb_replace = 0; spu->mm = NULL; diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h index f431d8b0..7cfcff3 100644 --- a/include/asm-powerpc/spu.h +++ b/include/asm-powerpc/spu.h @@ -117,6 +117,7 @@ struct spu { struct list_head list; struct list_head sched_list; int number; + int nid; u32 isrc; u32 node; u64 flags; -- cgit v0.10.2 From 022e4fc0fb2e4e179aaba4ee139dcb8fded0cba2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 1 May 2006 12:16:14 -0700 Subject: [PATCH] s390: fix ipd handling As pointed out by Paulo Marques MAX_IPD_TIME is by a factor of ten too small. Since this means that we allow ten times more IPDs in the intended time frame this could result in a cpu check stop of a physical cpu. Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/s390/s390mach.c b/drivers/s390/s390mach.c index 5ae1480..f99e553 100644 --- a/drivers/s390/s390mach.c +++ b/drivers/s390/s390mach.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -363,7 +364,7 @@ s390_revalidate_registers(struct mci *mci) } #define MAX_IPD_COUNT 29 -#define MAX_IPD_TIME (5 * 60 * 100 * 1000) /* 5 minutes */ +#define MAX_IPD_TIME (5 * 60 * USEC_PER_SEC) /* 5 minutes */ /* * machine check handler. -- cgit v0.10.2 From b44df334a7e909d88cf5c54cc0481b4e2eaeca23 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 1 May 2006 12:16:15 -0700 Subject: [PATCH] s390: bug in setup_rt_frame Consider return value of __put_user() when setting up a signal frame instead of ignoring it. Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index ae1927e..d48cfc7 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -358,8 +358,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } else { regs->gprs[14] = (unsigned long) frame->retcode | PSW_ADDR_AMODE; - err |= __put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, - (u16 __user *)(frame->retcode)); + if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, + (u16 __user *)(frame->retcode))) + goto give_sigsegv; } /* Set up backchain. */ -- cgit v0.10.2 From 3418ff76119da52f808eb496191d1fd380f53f3d Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Mon, 1 May 2006 12:16:16 -0700 Subject: [PATCH] RTC: rtc-dev tweak for 64-bit kernel Make rtc-dev work well on 64-bit platforms with 32-bit userland. On those platforms, users might try to read 32-bit integer value. This patch make rtc-dev's read() work well for both "int" and "long" size. This tweak is came from genrtc driver. Signed-off-by: Atsushi Nemoto Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c index b1e3e61..6c9ad92 100644 --- a/drivers/rtc/rtc-dev.c +++ b/drivers/rtc/rtc-dev.c @@ -58,7 +58,7 @@ rtc_dev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long data; ssize_t ret; - if (count < sizeof(unsigned long)) + if (count != sizeof(unsigned int) && count < sizeof(unsigned long)) return -EINVAL; add_wait_queue(&rtc->irq_queue, &wait); @@ -90,11 +90,16 @@ rtc_dev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (ret == 0) { /* Check for any data updates */ if (rtc->ops->read_callback) - data = rtc->ops->read_callback(rtc->class_dev.dev, data); - - ret = put_user(data, (unsigned long __user *)buf); - if (ret == 0) - ret = sizeof(unsigned long); + data = rtc->ops->read_callback(rtc->class_dev.dev, + data); + + if (sizeof(int) != sizeof(long) && + count == sizeof(unsigned int)) + ret = put_user(data, (unsigned int __user *)buf) ?: + sizeof(unsigned int); + else + ret = put_user(data, (unsigned long __user *)buf) ?: + sizeof(unsigned long); } return ret; } -- cgit v0.10.2 From f3537ea7b9c2f10397a8b68cd006981d7c615431 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Mon, 1 May 2006 12:16:17 -0700 Subject: [PATCH] genrtc: fix read on 64-bit platforms Fix genrtc's read() routine for 64-bit platforms. Current gen_rtc_read() stores 64bit integer and returns 8 even if an user tried to read a 32bit integer. Signed-off-by: Atsushi Nemoto Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/char/genrtc.c b/drivers/char/genrtc.c index d3a2bc3..588fca5 100644 --- a/drivers/char/genrtc.c +++ b/drivers/char/genrtc.c @@ -200,13 +200,13 @@ static ssize_t gen_rtc_read(struct file *file, char __user *buf, /* first test allows optimizer to nuke this case for 32-bit machines */ if (sizeof (int) != sizeof (long) && count == sizeof (unsigned int)) { unsigned int uidata = data; - retval = put_user(uidata, (unsigned long __user *)buf); + retval = put_user(uidata, (unsigned int __user *)buf) ?: + sizeof(unsigned int); } else { - retval = put_user(data, (unsigned long __user *)buf); + retval = put_user(data, (unsigned long __user *)buf) ?: + sizeof(unsigned long); } - if (!retval) - retval = sizeof(unsigned long); out: current->state = TASK_RUNNING; remove_wait_queue(&gen_rtc_wait, &wait); -- cgit v0.10.2 From 160bd18e5e545cbb4e5c26f54414485f8ac291ec Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Mon, 1 May 2006 12:16:18 -0700 Subject: [PATCH] x86_64: make PC Speaker driver work The PC Speaker driver's ->probe() routine doesn't even get called in the 64-bit kernels. The reason for that is that the arch code apparently has to explictly add a "pcspkr" platform device in order for the driver core to call the ->probe() routine. arch/i386/kernel/setup.c unconditionally adds a "pcspkr" device, but the x86_64 kernel has no code at all related to the PC Speaker. The patch below copies the relevant code from i386 to x86_64, which makes the PC Speaker work for me on x86_64. Cc: Dmitry Torokhov Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 759070c..ebc3c33 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -1426,3 +1426,22 @@ struct seq_operations cpuinfo_op = { .show = show_cpuinfo, }; +#ifdef CONFIG_INPUT_PCSPKR +#include +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + int ret; + + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); +#endif -- cgit v0.10.2 From 6ba815ded3fef03e888a9fc8eae3113938ff5349 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 1 May 2006 12:16:19 -0700 Subject: [PATCH] timer TSC check suspend notifier change At suspend time, the TSC CPUFREQ_SUSPENDCHANGE notifier change might wrongly enable interrupt. cpufreq driver suspend/resume is in interrupt disabled environment. Signed-off-by: Shaohua Li Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index 5e41ee2..f1187dd 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -279,7 +279,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, { struct cpufreq_freqs *freq = data; - if (val != CPUFREQ_RESUMECHANGE) + if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) write_seqlock_irq(&xtime_lock); if (!ref_freq) { if (!freq->old){ @@ -312,7 +312,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, } end: - if (val != CPUFREQ_RESUMECHANGE) + if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) write_sequnlock_irq(&xtime_lock); return 0; -- cgit v0.10.2 From 46c5ea3c9ae7fbc6e52a13c92e59d4fc7f4ca80a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 2 May 2006 05:12:22 +0200 Subject: [NETFILTER] x_tables: fix compat related crash on non-x86 When iptables userspace adds an ipt_standard_target, it calculates the size of the entire entry as: sizeof(struct ipt_entry) + XT_ALIGN(sizeof(struct ipt_standard_target)) ipt_standard_target looks like this: struct xt_standard_target { struct xt_entry_target target; int verdict; }; xt_entry_target contains a pointer, so when compiled for 64 bit the structure gets an extra 4 byte of padding at the end. On 32 bit architectures where iptables aligns to 8 byte it will also have 4 byte padding at the end because it is only 36 bytes large. The compat_ipt_standard_fn in the kernel adjusts the offsets by sizeof(struct ipt_standard_target) - sizeof(struct compat_ipt_standard_target), which will always result in 4, even if the structure from userspace was already padded to a multiple of 8. On x86 this works out by accident because userspace only aligns to 4, on all other architectures this is broken and causes incorrect adjustments to the size and following offsets. Thanks to Linus for lots of debugging help and testing. Signed-off-by: Patrick McHardy Signed-off-by: Linus Torvalds diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 3870145..48cc32d8 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -337,6 +337,10 @@ struct compat_xt_entry_match char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; + struct { + u_int16_t match_size; + compat_uptr_t match; + } kernel; u_int16_t match_size; } u; unsigned char data[0]; @@ -350,6 +354,10 @@ struct compat_xt_entry_target char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; + struct { + u_int16_t target_size; + compat_uptr_t target; + } kernel; u_int16_t target_size; } u; unsigned char data[0]; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d25ac8b..6d1c115 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -956,15 +956,16 @@ struct compat_ipt_standard_target compat_int_t verdict; }; -#define IPT_ST_OFFSET (sizeof(struct ipt_standard_target) - \ - sizeof(struct compat_ipt_standard_target)) - struct compat_ipt_standard { struct compat_ipt_entry entry; struct compat_ipt_standard_target target; }; +#define IPT_ST_LEN XT_ALIGN(sizeof(struct ipt_standard_target)) +#define IPT_ST_COMPAT_LEN COMPAT_XT_ALIGN(sizeof(struct compat_ipt_standard_target)) +#define IPT_ST_OFFSET (IPT_ST_LEN - IPT_ST_COMPAT_LEN) + static int compat_ipt_standard_fn(void *target, void **dstptr, int *size, int convert) { @@ -975,35 +976,29 @@ static int compat_ipt_standard_fn(void *target, ret = 0; switch (convert) { case COMPAT_TO_USER: - pst = (struct ipt_standard_target *)target; + pst = target; memcpy(&compat_st.target, &pst->target, - sizeof(struct ipt_entry_target)); + sizeof(compat_st.target)); compat_st.verdict = pst->verdict; if (compat_st.verdict > 0) compat_st.verdict -= compat_calc_jump(compat_st.verdict); - compat_st.target.u.user.target_size = - sizeof(struct compat_ipt_standard_target); - if (__copy_to_user(*dstptr, &compat_st, - sizeof(struct compat_ipt_standard_target))) + compat_st.target.u.user.target_size = IPT_ST_COMPAT_LEN; + if (copy_to_user(*dstptr, &compat_st, IPT_ST_COMPAT_LEN)) ret = -EFAULT; *size -= IPT_ST_OFFSET; - *dstptr += sizeof(struct compat_ipt_standard_target); + *dstptr += IPT_ST_COMPAT_LEN; break; case COMPAT_FROM_USER: - pcompat_st = - (struct compat_ipt_standard_target *)target; - memcpy(&st.target, &pcompat_st->target, - sizeof(struct ipt_entry_target)); + pcompat_st = target; + memcpy(&st.target, &pcompat_st->target, IPT_ST_COMPAT_LEN); st.verdict = pcompat_st->verdict; if (st.verdict > 0) st.verdict += compat_calc_jump(st.verdict); - st.target.u.user.target_size = - sizeof(struct ipt_standard_target); - memcpy(*dstptr, &st, - sizeof(struct ipt_standard_target)); + st.target.u.user.target_size = IPT_ST_LEN; + memcpy(*dstptr, &st, IPT_ST_LEN); *size += IPT_ST_OFFSET; - *dstptr += sizeof(struct ipt_standard_target); + *dstptr += IPT_ST_LEN; break; case COMPAT_CALC_SIZE: *size += IPT_ST_OFFSET; -- cgit v0.10.2 From 7591489a8fbee83f19bacc75756989a6a4d0389c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 May 2006 12:57:18 +0200 Subject: [PATCH] vmsplice: fix badly placed end paranthesis We need to use the minium of {len, PAGE_SIZE-off}, not {len, PAGE_SIZE}-off. The latter doesn't make any sense, and could cause us to attempt negative length transfers... Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index b150493..b0c157d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1143,7 +1143,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, * Fill this contiguous range into the partial page map. */ for (i = 0; i < error; i++) { - const int plen = min_t(size_t, len, PAGE_SIZE) - off; + const int plen = min_t(size_t, len, PAGE_SIZE - off); partial[buffers].offset = off; partial[buffers].len = plen; -- cgit v0.10.2 From a893b99be71f1d669b74f840e3a683dd077d007b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 May 2006 15:03:27 +0200 Subject: [PATCH] splice: fix page LRU accounting Currently we rely on the PIPE_BUF_FLAG_LRU flag being set correctly to know whether we need to fiddle with page LRU state after stealing it, however for some origins we just don't know if the page is on the LRU list or not. So remove PIPE_BUF_FLAG_LRU and do this check/add manually in pipe_to_file() instead. Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index b0c157d..3318b96 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -78,7 +78,6 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, return 1; } - buf->flags |= PIPE_BUF_FLAG_LRU; return 0; } @@ -86,8 +85,6 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) { page_cache_release(buf->page); - buf->page = NULL; - buf->flags &= ~PIPE_BUF_FLAG_LRU; } static int page_cache_pipe_buf_pin(struct pipe_inode_info *info, @@ -570,22 +567,36 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) { /* * If steal succeeds, buf->page is now pruned from the vm - * side (LRU and page cache) and we can reuse it. The page - * will also be looked on successful return. + * side (page cache) and we can reuse it. The page will also + * be locked on successful return. */ if (buf->ops->steal(info, buf)) goto find_page; page = buf->page; + page_cache_get(page); + + /* + * page must be on the LRU for adding to the pagecache. + * Check this without grabbing the zone lock, if it isn't + * the do grab the zone lock, recheck, and add if necessary. + */ + if (!PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (!PageLRU(page)) { + SetPageLRU(page); + add_page_to_inactive_list(zone, page); + } + spin_unlock_irq(&zone->lru_lock); + } + if (add_to_page_cache(page, mapping, index, gfp_mask)) { + page_cache_release(page); unlock_page(page); goto find_page; } - - page_cache_get(page); - - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(page); } else { find_page: page = find_lock_page(mapping, index); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index df4d3fa..070954f 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -5,9 +5,8 @@ #define PIPE_BUFFERS (16) -#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ -#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ -#define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ +#define PIPE_BUF_FLAG_ATOMIC 0x01 /* was atomically mapped */ +#define PIPE_BUF_FLAG_GIFT 0x02 /* page is a gift */ struct pipe_buffer { struct page *page; -- cgit v0.10.2 From 330ab71619bacc4d4494227a6cfc9b7f5500403d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 May 2006 15:29:57 +0200 Subject: [PATCH] vmsplice: restrict stealing a little more Apply the same rules as the anon pipe pages, only allow stealing if no one else is using the page. Signed-off-by: Jens Axboe diff --git a/fs/pipe.c b/fs/pipe.c index 3941a7f..5acd895 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -184,8 +184,8 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, kunmap(buf->page); } -static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +int generic_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; @@ -213,7 +213,7 @@ static struct pipe_buf_operations anon_pipe_buf_ops = { .unmap = generic_pipe_buf_unmap, .pin = generic_pipe_buf_pin, .release = anon_pipe_buf_release, - .steal = anon_pipe_buf_steal, + .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, }; diff --git a/fs/splice.c b/fs/splice.c index 3318b96..7fb0497 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -141,7 +141,7 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) return 1; - return 0; + return generic_pipe_buf_steal(pipe, buf); } static struct pipe_buf_operations user_page_pipe_buf_ops = { diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 070954f..ba73108 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -69,6 +69,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); +int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); /* * splice is tied to pipes as a transport (at least for now), so we'll just -- cgit v0.10.2 From 37be4e7809e0581db85387e126ae4da68c3d6286 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 2 May 2006 17:24:59 +0100 Subject: [MMC] extend data timeout for writes The CSD contains a "read2write factor" which determines the multiplier to be applied to the read timeout to obtain the write timeout. We were ignoring this parameter, resulting in the possibility for writes being timed out too early. Signed-off-by: Russell King diff --git a/drivers/mmc/mmc.c b/drivers/mmc/mmc.c index da6ddd9..05aa4d6 100644 --- a/drivers/mmc/mmc.c +++ b/drivers/mmc/mmc.c @@ -549,6 +549,7 @@ static void mmc_decode_csd(struct mmc_card *card) csd->read_partial = UNSTUFF_BITS(resp, 79, 1); csd->write_misalign = UNSTUFF_BITS(resp, 78, 1); csd->read_misalign = UNSTUFF_BITS(resp, 77, 1); + csd->r2w_factor = UNSTUFF_BITS(resp, 26, 3); csd->write_blkbits = UNSTUFF_BITS(resp, 22, 4); csd->write_partial = UNSTUFF_BITS(resp, 21, 1); } else { @@ -583,6 +584,7 @@ static void mmc_decode_csd(struct mmc_card *card) csd->read_partial = UNSTUFF_BITS(resp, 79, 1); csd->write_misalign = UNSTUFF_BITS(resp, 78, 1); csd->read_misalign = UNSTUFF_BITS(resp, 77, 1); + csd->r2w_factor = UNSTUFF_BITS(resp, 26, 3); csd->write_blkbits = UNSTUFF_BITS(resp, 22, 4); csd->write_partial = UNSTUFF_BITS(resp, 21, 1); } diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c index 8eb2a2e..06bd1f4 100644 --- a/drivers/mmc/mmc_block.c +++ b/drivers/mmc/mmc_block.c @@ -187,6 +187,12 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) brq.cmd.opcode = MMC_WRITE_BLOCK; brq.data.flags |= MMC_DATA_WRITE; brq.data.blocks = 1; + + /* + * Scale up the timeout by the r2w factor + */ + brq.data.timeout_ns <<= card->csd.r2w_factor; + brq.data.timeout_clks <<= card->csd.r2w_factor; } if (brq.data.blocks > 1) { diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 30dd978..991a373 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -28,6 +28,7 @@ struct mmc_csd { unsigned short cmdclass; unsigned short tacc_clks; unsigned int tacc_ns; + unsigned int r2w_factor; unsigned int max_dtr; unsigned int read_blkbits; unsigned int write_blkbits; -- cgit v0.10.2 From 58741e8b3603e56c3699550e8bc89cb136329343 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 2 May 2006 20:02:39 +0100 Subject: [MMC] PXA and i.MX: don't avoid sending stop command on error Always send a stop command at the end of a data transfer. If we avoid sending the stop command, some cards remain in data transfer mode, and refuse to accept further read/write commands. Signed-off-by: Russell King diff --git a/drivers/mmc/imxmmc.c b/drivers/mmc/imxmmc.c index 07f36c4..bc27192 100644 --- a/drivers/mmc/imxmmc.c +++ b/drivers/mmc/imxmmc.c @@ -529,7 +529,7 @@ static int imxmci_data_done(struct imxmci_host *host, unsigned int stat) data_error = imxmci_finish_data(host, stat); - if (host->req->stop && (data_error == MMC_ERR_NONE)) { + if (host->req->stop) { imxmci_stop_clock(host); imxmci_start_cmd(host, host->req->stop, 0); } else { diff --git a/drivers/mmc/pxamci.c b/drivers/mmc/pxamci.c index eb42cb3..15a5caa 100644 --- a/drivers/mmc/pxamci.c +++ b/drivers/mmc/pxamci.c @@ -291,7 +291,7 @@ static int pxamci_data_done(struct pxamci_host *host, unsigned int stat) pxamci_disable_irq(host, DATA_TRAN_DONE); host->data = NULL; - if (host->mrq->stop && data->error == MMC_ERR_NONE) { + if (host->mrq->stop) { pxamci_stop_clock(host); pxamci_start_cmd(host, host->mrq->stop, 0); } else { -- cgit v0.10.2 From d78e9079af7526c4661ff484322094832776ef41 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 2 May 2006 20:18:53 +0100 Subject: [MMC] PXA: reduce the number of lines PXAMCI debug uses There's no reason for the PXAMCI debug code to print so many lines - it causes the kernel buffer to overflow when trying to debug this driver. Remove some debug messages which are duplicated by core code, and combine other messages, resulting in fewer characters written to the kernel log. Signed-off-by: Russell King diff --git a/drivers/mmc/pxamci.c b/drivers/mmc/pxamci.c index 15a5caa..40970c4 100644 --- a/drivers/mmc/pxamci.c +++ b/drivers/mmc/pxamci.c @@ -198,7 +198,6 @@ static void pxamci_start_cmd(struct pxamci_host *host, struct mmc_command *cmd, static void pxamci_finish_request(struct pxamci_host *host, struct mmc_request *mrq) { - pr_debug("PXAMCI: request done\n"); host->mrq = NULL; host->cmd = NULL; host->data = NULL; @@ -309,12 +308,10 @@ static irqreturn_t pxamci_irq(int irq, void *devid, struct pt_regs *regs) ireg = readl(host->base + MMC_I_REG); - pr_debug("PXAMCI: irq %08x\n", ireg); - if (ireg) { unsigned stat = readl(host->base + MMC_STAT); - pr_debug("PXAMCI: stat %08x\n", stat); + pr_debug("PXAMCI: irq %08x stat %08x\n", ireg, stat); if (ireg & END_CMD_RES) handled |= pxamci_cmd_done(host, stat); @@ -368,7 +365,7 @@ static void pxamci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) { struct pxamci_host *host = mmc_priv(mmc); - pr_debug("pxamci_set_ios: clock %u power %u vdd %u.%02u\n", + pr_debug("PXAMCI: clock %u power %u vdd %u.%02u\n", ios->clock, ios->power_mode, ios->vdd / 100, ios->vdd % 100); @@ -397,7 +394,7 @@ static void pxamci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) host->cmdat |= CMDAT_INIT; } - pr_debug("pxamci_set_ios: clkrt = %x cmdat = %x\n", + pr_debug("PXAMCI: clkrt = %x cmdat = %x\n", host->clkrt, host->cmdat); } -- cgit v0.10.2 From b0b8dab288590ede2377a671db0a31380f454541 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Thu, 27 Apr 2006 18:23:49 -0700 Subject: [PATCH] mv643xx_eth: provide sysfs class device symlink On Sat, Mar 11, Olaf Hering wrote: > Why is the /sys/class/net/eth0/device symlink not created for the > mv643xx_eth driver? Does this work for other platform device drivers? > Seems to work for the ps2 keyboard at least. The SET_NETDEV_DEV has to be done before a call to register_netdev. With the new patch below, the device symlink for the platform device was created. Unfortunately, after the 4 ls commands, the network connection died. No idea if the box crashed or if something else broke, lost remote access. Provide sysfs 'device' in /class/net/ethN Also, set module owner field, like pcnet32 driver does. Signed-off-by: Olaf Hering Acked-by: Dale Farnsworth Signed-off-by: Andrew Morton Signed-off-by: Jeff Garzik diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index ea62a3e..411f4d8 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -1419,6 +1419,8 @@ static int mv643xx_eth_probe(struct platform_device *pdev) mv643xx_eth_update_pscr(dev, &cmd); mv643xx_set_settings(dev, &cmd); + SET_MODULE_OWNER(dev); + SET_NETDEV_DEV(dev, &pdev->dev); err = register_netdev(dev); if (err) goto out; -- cgit v0.10.2 From 3e0d167a6b6e5722d7fadfad9b817f668ab41ec1 Mon Sep 17 00:00:00 2001 From: Craig Brind Date: Thu, 27 Apr 2006 02:30:46 -0700 Subject: [PATCH] via-rhine: zero pad short packets on Rhine I ethernet cards Fixes Rhine I cards disclosing fragments of previously transmitted frames in new transmissions. Before transmission, any socket buffer (skb) shorter than the ethernet minimum length of 60 bytes was zero-padded. On Rhine I cards the data can later be copied into an aligned transmission buffer without copying this padding. This resulted in the transmission of the frame with the extra bytes beyond the provided content leaking the previous contents of this buffer on to the network. Now zero-padding is repeated in the local aligned buffer if one is used. Following a suggestion from the via-rhine maintainer, no attempt is made here to avoid the duplicated effort of padding the skb if it is known that an aligned buffer will definitely be used. This is to make the change "obviously correct" and allow it to be applied to a stable kernel if necessary. There is no change to the flow of control and the changes are only to the Rhine I code path. The patch has run on an in-service Rhine-I host without incident. Frames shorter than 60 bytes are now correctly zero-padded when captured on a separate host. I see no unusual stats reported by ifconfig, and no unusual log messages. Signed-off-by: Craig Brind Signed-off-by: Roger Luethi Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Jeff Garzik diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c index 6a23964..a6dc53b 100644 --- a/drivers/net/via-rhine.c +++ b/drivers/net/via-rhine.c @@ -129,6 +129,7 @@ - Massive clean-up - Rewrite PHY, media handling (remove options, full_duplex, backoff) - Fix Tx engine race for good + - Craig Brind: Zero padded aligned buffers for short packets. */ @@ -1326,7 +1327,12 @@ static int rhine_start_tx(struct sk_buff *skb, struct net_device *dev) rp->stats.tx_dropped++; return 0; } + + /* Padding is not copied and so must be redone. */ skb_copy_and_csum_dev(skb, rp->tx_buf[entry]); + if (skb->len < ETH_ZLEN) + memset(rp->tx_buf[entry] + skb->len, 0, + ETH_ZLEN - skb->len); rp->tx_skbuff_dma[entry] = 0; rp->tx_ring[entry].addr = cpu_to_le32(rp->tx_bufs_dma + (rp->tx_buf[entry] - -- cgit v0.10.2 From ebf34c9b6fcd22338ef764b039b3ac55ed0e297b Mon Sep 17 00:00:00 2001 From: Ayaz Abdulla Date: Tue, 2 May 2006 15:26:06 -0400 Subject: forcedeth: fix multi irq issues This patch fixes the issues with multiple irqs. I am resending based on feedback. I decoupled the dma mask for consistent memory and fixed leak with multiple irq in error path. Thanks to Manfred for catching the spin lock problem. Signed-Off-By: Ayaz Abdulla diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 9788b1e..f7235c9 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -106,6 +106,7 @@ * 0.51: 20 Jan 2006: Add 64bit consistent memory allocation for rings. * 0.52: 20 Jan 2006: Add MSI/MSIX support. * 0.53: 19 Mar 2006: Fix init from low power mode and add hw reset. + * 0.54: 21 Mar 2006: Fix spin locks for multi irqs and cleanup. * * Known bugs: * We suspect that on some hardware no TX done interrupts are generated. @@ -117,7 +118,7 @@ * DEV_NEED_TIMERIRQ will not harm you on sane hardware, only generating a few * superfluous timer interrupts from the nic. */ -#define FORCEDETH_VERSION "0.53" +#define FORCEDETH_VERSION "0.54" #define DRV_NAME "forcedeth" #include @@ -710,6 +711,72 @@ static void setup_hw_rings(struct net_device *dev, int rxtx_flags) } } +static int using_multi_irqs(struct net_device *dev) +{ + struct fe_priv *np = get_nvpriv(dev); + + if (!(np->msi_flags & NV_MSI_X_ENABLED) || + ((np->msi_flags & NV_MSI_X_ENABLED) && + ((np->msi_flags & NV_MSI_X_VECTORS_MASK) == 0x1))) + return 0; + else + return 1; +} + +static void nv_enable_irq(struct net_device *dev) +{ + struct fe_priv *np = get_nvpriv(dev); + + if (!using_multi_irqs(dev)) { + if (np->msi_flags & NV_MSI_X_ENABLED) + enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector); + else + enable_irq(dev->irq); + } else { + enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector); + enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_TX].vector); + enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_OTHER].vector); + } +} + +static void nv_disable_irq(struct net_device *dev) +{ + struct fe_priv *np = get_nvpriv(dev); + + if (!using_multi_irqs(dev)) { + if (np->msi_flags & NV_MSI_X_ENABLED) + disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector); + else + disable_irq(dev->irq); + } else { + disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector); + disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_TX].vector); + disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_OTHER].vector); + } +} + +/* In MSIX mode, a write to irqmask behaves as XOR */ +static void nv_enable_hw_interrupts(struct net_device *dev, u32 mask) +{ + u8 __iomem *base = get_hwbase(dev); + + writel(mask, base + NvRegIrqMask); +} + +static void nv_disable_hw_interrupts(struct net_device *dev, u32 mask) +{ + struct fe_priv *np = get_nvpriv(dev); + u8 __iomem *base = get_hwbase(dev); + + if (np->msi_flags & NV_MSI_X_ENABLED) { + writel(mask, base + NvRegIrqMask); + } else { + if (np->msi_flags & NV_MSI_ENABLED) + writel(0, base + NvRegMSIIrqMask); + writel(0, base + NvRegIrqMask); + } +} + #define MII_READ (-1) /* mii_rw: read/write a register on the PHY. * @@ -1019,24 +1086,25 @@ static void nv_do_rx_refill(unsigned long data) struct net_device *dev = (struct net_device *) data; struct fe_priv *np = netdev_priv(dev); - - if (!(np->msi_flags & NV_MSI_X_ENABLED) || - ((np->msi_flags & NV_MSI_X_ENABLED) && - ((np->msi_flags & NV_MSI_X_VECTORS_MASK) == 0x1))) { - disable_irq(dev->irq); + if (!using_multi_irqs(dev)) { + if (np->msi_flags & NV_MSI_X_ENABLED) + disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector); + else + disable_irq(dev->irq); } else { disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector); } if (nv_alloc_rx(dev)) { - spin_lock(&np->lock); + spin_lock_irq(&np->lock); if (!np->in_shutdown) mod_timer(&np->oom_kick, jiffies + OOM_REFILL); - spin_unlock(&np->lock); + spin_unlock_irq(&np->lock); } - if (!(np->msi_flags & NV_MSI_X_ENABLED) || - ((np->msi_flags & NV_MSI_X_ENABLED) && - ((np->msi_flags & NV_MSI_X_VECTORS_MASK) == 0x1))) { - enable_irq(dev->irq); + if (!using_multi_irqs(dev)) { + if (np->msi_flags & NV_MSI_X_ENABLED) + enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector); + else + enable_irq(dev->irq); } else { enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector); } @@ -1668,15 +1736,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu) * guessed, there is probably a simpler approach. * Changing the MTU is a rare event, it shouldn't matter. */ - if (!(np->msi_flags & NV_MSI_X_ENABLED) || - ((np->msi_flags & NV_MSI_X_ENABLED) && - ((np->msi_flags & NV_MSI_X_VECTORS_MASK) == 0x1))) { - disable_irq(dev->irq); - } else { - disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector); - disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_TX].vector); - disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_OTHER].vector); - } + nv_disable_irq(dev); spin_lock_bh(&dev->xmit_lock); spin_lock(&np->lock); /* stop engines */ @@ -1709,15 +1769,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu) nv_start_tx(dev); spin_unlock(&np->lock); spin_unlock_bh(&dev->xmit_lock); - if (!(np->msi_flags & NV_MSI_X_ENABLED) || - ((np->msi_flags & NV_MSI_X_ENABLED) && - ((np->msi_flags & NV_MSI_X_VECTORS_MASK) == 0x1))) { - enable_irq(dev->irq); - } else { - enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector); - enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_TX].vector); - enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_OTHER].vector); - } + nv_enable_irq(dev); } return 0; } @@ -2108,16 +2160,16 @@ static irqreturn_t nv_nic_irq_tx(int foo, void *data, struct pt_regs *regs) if (!(events & np->irqmask)) break; - spin_lock(&np->lock); + spin_lock_irq(&np->lock); nv_tx_done(dev); - spin_unlock(&np->lock); + spin_unlock_irq(&np->lock); if (events & (NVREG_IRQ_TX_ERR)) { dprintk(KERN_DEBUG "%s: received irq with events 0x%x. Probably TX fail.\n", dev->name, events); } if (i > max_interrupt_work) { - spin_lock(&np->lock); + spin_lock_irq(&np->lock); /* disable interrupts on the nic */ writel(NVREG_IRQ_TX_ALL, base + NvRegIrqMask); pci_push(base); @@ -2127,7 +2179,7 @@ static irqreturn_t nv_nic_irq_tx(int foo, void *data, struct pt_regs *regs) mod_timer(&np->nic_poll, jiffies + POLL_WAIT); } printk(KERN_DEBUG "%s: too many iterations (%d) in nv_nic_irq_tx.\n", dev->name, i); - spin_unlock(&np->lock); + spin_unlock_irq(&np->lock); break; } @@ -2157,14 +2209,14 @@ static irqreturn_t nv_nic_irq_rx(int foo, void *data, struct pt_regs *regs) nv_rx_process(dev); if (nv_alloc_rx(dev)) { - spin_lock(&np->lock); + spin_lock_irq(&np->lock); if (!np->in_shutdown) mod_timer(&np->oom_kick, jiffies + OOM_REFILL); - spin_unlock(&np->lock); + spin_unlock_irq(&np->lock); } if (i > max_interrupt_work) { - spin_lock(&np->lock); + spin_lock_irq(&np->lock); /* disable interrupts on the nic */ writel(NVREG_IRQ_RX_ALL, base + NvRegIrqMask); pci_push(base); @@ -2174,7 +2226,7 @@ static irqreturn_t nv_nic_irq_rx(int foo, void *data, struct pt_regs *regs) mod_timer(&np->nic_poll, jiffies + POLL_WAIT); } printk(KERN_DEBUG "%s: too many iterations (%d) in nv_nic_irq_rx.\n", dev->name, i); - spin_unlock(&np->lock); + spin_unlock_irq(&np->lock); break; } @@ -2203,14 +2255,14 @@ static irqreturn_t nv_nic_irq_other(int foo, void *data, struct pt_regs *regs) break; if (events & NVREG_IRQ_LINK) { - spin_lock(&np->lock); + spin_lock_irq(&np->lock); nv_link_irq(dev); - spin_unlock(&np->lock); + spin_unlock_irq(&np->lock); } if (np->need_linktimer && time_after(jiffies, np->link_timeout)) { - spin_lock(&np->lock); + spin_lock_irq(&np->lock); nv_linkchange(dev); - spin_unlock(&np->lock); + spin_unlock_irq(&np->lock); np->link_timeout = jiffies + LINK_TIMEOUT; } if (events & (NVREG_IRQ_UNKNOWN)) { @@ -2218,7 +2270,7 @@ static irqreturn_t nv_nic_irq_other(int foo, void *data, struct pt_regs *regs) dev->name, events); } if (i > max_interrupt_work) { - spin_lock(&np->lock); + spin_lock_irq(&np->lock); /* disable interrupts on the nic */ writel(NVREG_IRQ_OTHER, base + NvRegIrqMask); pci_push(base); @@ -2228,7 +2280,7 @@ static irqreturn_t nv_nic_irq_other(int foo, void *data, struct pt_regs *regs) mod_timer(&np->nic_poll, jiffies + POLL_WAIT); } printk(KERN_DEBUG "%s: too many iterations (%d) in nv_nic_irq_other.\n", dev->name, i); - spin_unlock(&np->lock); + spin_unlock_irq(&np->lock); break; } @@ -2251,10 +2303,11 @@ static void nv_do_nic_poll(unsigned long data) * nv_nic_irq because that may decide to do otherwise */ - if (!(np->msi_flags & NV_MSI_X_ENABLED) || - ((np->msi_flags & NV_MSI_X_ENABLED) && - ((np->msi_flags & NV_MSI_X_VECTORS_MASK) == 0x1))) { - disable_irq(dev->irq); + if (!using_multi_irqs(dev)) { + if (np->msi_flags & NV_MSI_X_ENABLED) + disable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector); + else + disable_irq(dev->irq); mask = np->irqmask; } else { if (np->nic_poll_irq & NVREG_IRQ_RX_ALL) { @@ -2277,11 +2330,12 @@ static void nv_do_nic_poll(unsigned long data) writel(mask, base + NvRegIrqMask); pci_push(base); - if (!(np->msi_flags & NV_MSI_X_ENABLED) || - ((np->msi_flags & NV_MSI_X_ENABLED) && - ((np->msi_flags & NV_MSI_X_VECTORS_MASK) == 0x1))) { + if (!using_multi_irqs(dev)) { nv_nic_irq((int) 0, (void *) data, (struct pt_regs *) NULL); - enable_irq(dev->irq); + if (np->msi_flags & NV_MSI_X_ENABLED) + enable_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector); + else + enable_irq(dev->irq); } else { if (np->nic_poll_irq & NVREG_IRQ_RX_ALL) { nv_nic_irq_rx((int) 0, (void *) data, (struct pt_regs *) NULL); @@ -2628,6 +2682,113 @@ static void set_msix_vector_map(struct net_device *dev, u32 vector, u32 irqmask) writel(readl(base + NvRegMSIXMap1) | msixmap, base + NvRegMSIXMap1); } +static int nv_request_irq(struct net_device *dev) +{ + struct fe_priv *np = get_nvpriv(dev); + u8 __iomem *base = get_hwbase(dev); + int ret = 1; + int i; + + if (np->msi_flags & NV_MSI_X_CAPABLE) { + for (i = 0; i < (np->msi_flags & NV_MSI_X_VECTORS_MASK); i++) { + np->msi_x_entry[i].entry = i; + } + if ((ret = pci_enable_msix(np->pci_dev, np->msi_x_entry, (np->msi_flags & NV_MSI_X_VECTORS_MASK))) == 0) { + np->msi_flags |= NV_MSI_X_ENABLED; + if (optimization_mode == NV_OPTIMIZATION_MODE_THROUGHPUT) { + /* Request irq for rx handling */ + if (request_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector, &nv_nic_irq_rx, SA_SHIRQ, dev->name, dev) != 0) { + printk(KERN_INFO "forcedeth: request_irq failed for rx %d\n", ret); + pci_disable_msix(np->pci_dev); + np->msi_flags &= ~NV_MSI_X_ENABLED; + goto out_err; + } + /* Request irq for tx handling */ + if (request_irq(np->msi_x_entry[NV_MSI_X_VECTOR_TX].vector, &nv_nic_irq_tx, SA_SHIRQ, dev->name, dev) != 0) { + printk(KERN_INFO "forcedeth: request_irq failed for tx %d\n", ret); + pci_disable_msix(np->pci_dev); + np->msi_flags &= ~NV_MSI_X_ENABLED; + goto out_free_rx; + } + /* Request irq for link and timer handling */ + if (request_irq(np->msi_x_entry[NV_MSI_X_VECTOR_OTHER].vector, &nv_nic_irq_other, SA_SHIRQ, dev->name, dev) != 0) { + printk(KERN_INFO "forcedeth: request_irq failed for link %d\n", ret); + pci_disable_msix(np->pci_dev); + np->msi_flags &= ~NV_MSI_X_ENABLED; + goto out_free_tx; + } + /* map interrupts to their respective vector */ + writel(0, base + NvRegMSIXMap0); + writel(0, base + NvRegMSIXMap1); + set_msix_vector_map(dev, NV_MSI_X_VECTOR_RX, NVREG_IRQ_RX_ALL); + set_msix_vector_map(dev, NV_MSI_X_VECTOR_TX, NVREG_IRQ_TX_ALL); + set_msix_vector_map(dev, NV_MSI_X_VECTOR_OTHER, NVREG_IRQ_OTHER); + } else { + /* Request irq for all interrupts */ + if (request_irq(np->msi_x_entry[NV_MSI_X_VECTOR_ALL].vector, &nv_nic_irq, SA_SHIRQ, dev->name, dev) != 0) { + printk(KERN_INFO "forcedeth: request_irq failed %d\n", ret); + pci_disable_msix(np->pci_dev); + np->msi_flags &= ~NV_MSI_X_ENABLED; + goto out_err; + } + + /* map interrupts to vector 0 */ + writel(0, base + NvRegMSIXMap0); + writel(0, base + NvRegMSIXMap1); + } + } + } + if (ret != 0 && np->msi_flags & NV_MSI_CAPABLE) { + if ((ret = pci_enable_msi(np->pci_dev)) == 0) { + np->msi_flags |= NV_MSI_ENABLED; + if (request_irq(np->pci_dev->irq, &nv_nic_irq, SA_SHIRQ, dev->name, dev) != 0) { + printk(KERN_INFO "forcedeth: request_irq failed %d\n", ret); + pci_disable_msi(np->pci_dev); + np->msi_flags &= ~NV_MSI_ENABLED; + goto out_err; + } + + /* map interrupts to vector 0 */ + writel(0, base + NvRegMSIMap0); + writel(0, base + NvRegMSIMap1); + /* enable msi vector 0 */ + writel(NVREG_MSI_VECTOR_0_ENABLED, base + NvRegMSIIrqMask); + } + } + if (ret != 0) { + if (request_irq(np->pci_dev->irq, &nv_nic_irq, SA_SHIRQ, dev->name, dev) != 0) + goto out_err; + } + + return 0; +out_free_tx: + free_irq(np->msi_x_entry[NV_MSI_X_VECTOR_TX].vector, dev); +out_free_rx: + free_irq(np->msi_x_entry[NV_MSI_X_VECTOR_RX].vector, dev); +out_err: + return 1; +} + +static void nv_free_irq(struct net_device *dev) +{ + struct fe_priv *np = get_nvpriv(dev); + int i; + + if (np->msi_flags & NV_MSI_X_ENABLED) { + for (i = 0; i < (np->msi_flags & NV_MSI_X_VECTORS_MASK); i++) { + free_irq(np->msi_x_entry[i].vector, dev); + } + pci_disable_msix(np->pci_dev); + np->msi_flags &= ~NV_MSI_X_ENABLED; + } else { + free_irq(np->pci_dev->irq, dev); + if (np->msi_flags & NV_MSI_ENABLED) { + pci_disable_msi(np->pci_dev); + np->msi_flags &= ~NV_MSI_ENABLED; + } + } +} + static int nv_open(struct net_device *dev) { struct fe_priv *np = netdev_priv(dev); @@ -2720,12 +2881,16 @@ static int nv_open(struct net_device *dev) udelay(10); writel(readl(base + NvRegPowerState) | NVREG_POWERSTATE_VALID, base + NvRegPowerState); - writel(0, base + NvRegIrqMask); + nv_disable_hw_interrupts(dev, np->irqmask); pci_push(base); writel(NVREG_MIISTAT_MASK2, base + NvRegMIIStatus); writel(NVREG_IRQSTAT_MASK, base + NvRegIrqStatus); pci_push(base); + if (nv_request_irq(dev)) { + goto out_drain; + } + if (np->msi_flags & NV_MSI_X_CAPABLE) { for (i = 0; i < (np->msi_flags & NV_MSI_X_VECTORS_MASK); i++) { np->msi_x_entry[i].entry = i; @@ -2799,7 +2964,7 @@ static int nv_open(struct net_device *dev) } /* ask for interrupts */ - writel(np->irqmask, base + NvRegIrqMask); + nv_enable_hw_interrupts(dev, np->irqmask); spin_lock_irq(&np->lock); writel(NVREG_MCASTADDRA_FORCE, base + NvRegMulticastAddrA); @@ -2843,7 +3008,6 @@ static int nv_close(struct net_device *dev) { struct fe_priv *np = netdev_priv(dev); u8 __iomem *base; - int i; spin_lock_irq(&np->lock); np->in_shutdown = 1; @@ -2861,31 +3025,13 @@ static int nv_close(struct net_device *dev) /* disable interrupts on the nic or we will lock up */ base = get_hwbase(dev); - if (np->msi_flags & NV_MSI_X_ENABLED) { - writel(np->irqmask, base + NvRegIrqMask); - } else { - if (np->msi_flags & NV_MSI_ENABLED) - writel(0, base + NvRegMSIIrqMask); - writel(0, base + NvRegIrqMask); - } + nv_disable_hw_interrupts(dev, np->irqmask); pci_push(base); dprintk(KERN_INFO "%s: Irqmask is zero again\n", dev->name); spin_unlock_irq(&np->lock); - if (np->msi_flags & NV_MSI_X_ENABLED) { - for (i = 0; i < (np->msi_flags & NV_MSI_X_VECTORS_MASK); i++) { - free_irq(np->msi_x_entry[i].vector, dev); - } - pci_disable_msix(np->pci_dev); - np->msi_flags &= ~NV_MSI_X_ENABLED; - } else { - free_irq(np->pci_dev->irq, dev); - if (np->msi_flags & NV_MSI_ENABLED) { - pci_disable_msi(np->pci_dev); - np->msi_flags &= ~NV_MSI_ENABLED; - } - } + nv_free_irq(dev); drain_ring(dev); @@ -2974,20 +3120,18 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i if (id->driver_data & DEV_HAS_HIGH_DMA) { /* packet format 3: supports 40-bit addressing */ np->desc_ver = DESC_VER_3; + np->txrxctl_bits = NVREG_TXRXCTL_DESC_3; if (pci_set_dma_mask(pci_dev, DMA_39BIT_MASK)) { printk(KERN_INFO "forcedeth: 64-bit DMA failed, using 32-bit addressing for device %s.\n", pci_name(pci_dev)); } else { - if (pci_set_consistent_dma_mask(pci_dev, 0x0000007fffffffffULL)) { - printk(KERN_INFO "forcedeth: 64-bit DMA (consistent) failed for device %s.\n", - pci_name(pci_dev)); - goto out_relreg; - } else { - dev->features |= NETIF_F_HIGHDMA; - printk(KERN_INFO "forcedeth: using HIGHDMA\n"); - } + dev->features |= NETIF_F_HIGHDMA; + printk(KERN_INFO "forcedeth: using HIGHDMA\n"); + } + if (pci_set_consistent_dma_mask(pci_dev, 0x0000007fffffffffULL)) { + printk(KERN_INFO "forcedeth: 64-bit DMA (consistent) failed for device %s.\n", + pci_name(pci_dev)); } - np->txrxctl_bits = NVREG_TXRXCTL_DESC_3; } else if (id->driver_data & DEV_HAS_LARGEDESC) { /* packet format 2: supports jumbo frames */ np->desc_ver = DESC_VER_2; -- cgit v0.10.2 From b2556da55f78a9dbe92830b1d1c0b612edfea9fd Mon Sep 17 00:00:00 2001 From: Uwe Zeisberger Date: Tue, 2 May 2006 20:40:56 +0100 Subject: [ARM] 3488/1: make icedcc_putc do the right thing Patch from Uwe Zeisberger a) use coprocessor 14 b) make reading the dcc status volatile Signed-off-by: Uwe Zeisberger Signed-off-by: Russell King diff --git a/arch/arm/boot/compressed/misc.c b/arch/arm/boot/compressed/misc.c index 0af3772..ace3fb58 100644 --- a/arch/arm/boot/compressed/misc.c +++ b/arch/arm/boot/compressed/misc.c @@ -38,10 +38,10 @@ static void icedcc_putc(int ch) if (--i < 0) return; - asm("mrc p14, 0, %0, c0, c0, 0" : "=r" (status)); + asm volatile ("mrc p14, 0, %0, c0, c0, 0" : "=r" (status)); } while (status & 2); - asm("mcr p15, 0, %0, c1, c0, 0" : : "r" (ch)); + asm("mcr p14, 0, %0, c1, c0, 0" : : "r" (ch)); } #define putc(ch) icedcc_putc(ch) -- cgit v0.10.2 From e17df688f7064dae1417ce425dd1e4b71d24d63b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 2 May 2006 23:23:07 +0200 Subject: [NETFILTER] SCTP conntrack: fix infinite loop fix infinite loop in the SCTP-netfilter code: check SCTP chunk size to guarantee progress of for_each_sctp_chunk(). (all other uses of for_each_sctp_chunk() are preceded by do_basic_checks(), so this fix should be complete.) Based on patch from Ingo Molnar CVE-2006-1527 Signed-off-by: Patrick McHardy Signed-off-by: Linus Torvalds diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 5259abd..0416073 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -235,12 +235,15 @@ static int do_basic_checks(struct ip_conntrack *conntrack, flag = 1; } - /* Cookie Ack/Echo chunks not the first OR - Init / Init Ack / Shutdown compl chunks not the only chunks */ - if ((sch->type == SCTP_CID_COOKIE_ACK + /* + * Cookie Ack/Echo chunks not the first OR + * Init / Init Ack / Shutdown compl chunks not the only chunks + * OR zero-length. + */ + if (((sch->type == SCTP_CID_COOKIE_ACK || sch->type == SCTP_CID_COOKIE_ECHO || flag) - && count !=0 ) { + && count !=0) || !sch->length) { DEBUGP("Basic checks failed\n"); return 1; } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 9cccc32..0c6da49 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -240,12 +240,15 @@ static int do_basic_checks(struct nf_conn *conntrack, flag = 1; } - /* Cookie Ack/Echo chunks not the first OR - Init / Init Ack / Shutdown compl chunks not the only chunks */ - if ((sch->type == SCTP_CID_COOKIE_ACK + /* + * Cookie Ack/Echo chunks not the first OR + * Init / Init Ack / Shutdown compl chunks not the only chunks + * OR zero-length. + */ + if (((sch->type == SCTP_CID_COOKIE_ACK || sch->type == SCTP_CID_COOKIE_ECHO || flag) - && count !=0 ) { + && count !=0) || !sch->length) { DEBUGP("Basic checks failed\n"); return 1; } -- cgit v0.10.2 From 054d8ff37710efaebd1998ce94d366df315a354f Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Thu, 27 Apr 2006 02:31:20 -0700 Subject: [PATCH] powerpc/pseries: avoid crash in PCI code if mem system not up The powerpc code is currently performing PCI setup before memory initialization. PCI setup touches PCI config space registers. If the PCI card is bad, this will evoke an error, which currrently can't be handled, as the PCI error recovery code expects kmalloc() to be functional. This patch will cause the system to punt instead of crashing with cpu 0x0: Vector: 300 (Data Access) at [c0000000004434d0] pc: c0000000000c06b4: .kmem_cache_alloc+0x8c/0xf4 lr: c00000000004ad6c: .eeh_send_failure_event+0x48/0xfc This patch will also print name of the offending pci device. Signed-off-by: Linas Vepstas Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c index a1bda6f..40020c6 100644 --- a/arch/powerpc/platforms/pseries/eeh_event.c +++ b/arch/powerpc/platforms/pseries/eeh_event.c @@ -118,7 +118,15 @@ int eeh_send_failure_event (struct device_node *dn, { unsigned long flags; struct eeh_event *event; + char *location; + if (!mem_init_done) { + printk(KERN_ERR "EEH: event during early boot not handled\n"); + location = (char *) get_property(dn, "ibm,loc-code", NULL); + printk(KERN_ERR "EEH: device node = %s\n", dn->full_name); + printk(KERN_ERR "EEH: PCI location = %s\n", location); + return 1; + } event = kmalloc(sizeof(*event), GFP_ATOMIC); if (event == NULL) { printk (KERN_ERR "EEH: out of memory, event not handled\n"); -- cgit v0.10.2 From 0ccde0a290b44b8296b82a7683b4c299eb51ba6b Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 28 Apr 2006 17:38:42 +0530 Subject: [PATCH] powerpc/kprobes: fix singlestep out-of-line We currently single-step inline if the instruction on which a kprobe is inserted is a trap variant. - variants (such as tdnei, used by BUG()) typically evaluate a condition and cause a trap only if the condition is satisfied. - kprobes uses the unconditional "trap" (0x7fe00008) and single-stepping again on this instruction, resulting in another trap without evaluating the condition is obviously incorrect. Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 856ef1a..f788663 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -90,15 +90,15 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { - kprobe_opcode_t insn = *p->ainsn.insn; - regs->msr |= MSR_SE; - /* single step inline if it is a trap variant */ - if (is_trap(insn)) - regs->nip = (unsigned long)p->addr; - else - regs->nip = (unsigned long)p->ainsn.insn; + /* + * On powerpc we should single step on the original + * instruction even if the probed insn is a trap + * variant as values in regs could play a part in + * if the trap is taken or not + */ + regs->nip = (unsigned long)p->ainsn.insn; } static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) -- cgit v0.10.2 From 61f5657c50341198ff05e375e6f1fc0476556562 Mon Sep 17 00:00:00 2001 From: Vitaly Bordug Date: Sat, 29 Apr 2006 22:32:44 +0400 Subject: [PATCH] ppc32 CPM_UART: Fixed break send on SCC SCC uart sends a break sequence each time it is stopped with the CPM_CR_STOP_TX command. That means that each time an application closes the serial device, a break is transmitted. To fix this, graceful tx stop is issued for SCC. Signed-off-by: David Jander Signed-off-by: Vitaly Bordug Signed-off-by: Paul Mackerras diff --git a/drivers/serial/cpm_uart/cpm_uart_core.c b/drivers/serial/cpm_uart/cpm_uart_core.c index ced193b..b9d1185 100644 --- a/drivers/serial/cpm_uart/cpm_uart_core.c +++ b/drivers/serial/cpm_uart/cpm_uart_core.c @@ -457,7 +457,11 @@ static void cpm_uart_shutdown(struct uart_port *port) } /* Shut them really down and reinit buffer descriptors */ - cpm_line_cr_cmd(line, CPM_CR_STOP_TX); + if (IS_SMC(pinfo)) + cpm_line_cr_cmd(line, CPM_CR_STOP_TX); + else + cpm_line_cr_cmd(line, CPM_CR_GRA_STOP_TX); + cpm_uart_initbd(pinfo); } } diff --git a/include/asm-ppc/commproc.h b/include/asm-ppc/commproc.h index 973e609..31f3629 100644 --- a/include/asm-ppc/commproc.h +++ b/include/asm-ppc/commproc.h @@ -35,6 +35,7 @@ #define CPM_CR_INIT_TX ((ushort)0x0002) #define CPM_CR_HUNT_MODE ((ushort)0x0003) #define CPM_CR_STOP_TX ((ushort)0x0004) +#define CPM_CR_GRA_STOP_TX ((ushort)0x0005) #define CPM_CR_RESTART_TX ((ushort)0x0006) #define CPM_CR_CLOSE_RX_BD ((ushort)0x0007) #define CPM_CR_SET_GADDR ((ushort)0x0008) diff --git a/include/asm-ppc/cpm2.h b/include/asm-ppc/cpm2.h index b638b87..c70344b 100644 --- a/include/asm-ppc/cpm2.h +++ b/include/asm-ppc/cpm2.h @@ -69,7 +69,7 @@ #define CPM_CR_INIT_TX ((ushort)0x0002) #define CPM_CR_HUNT_MODE ((ushort)0x0003) #define CPM_CR_STOP_TX ((ushort)0x0004) -#define CPM_CR_GRA_STOP_TX ((ushort)0x0005) +#define CPM_CR_GRA_STOP_TX ((ushort)0x0005) #define CPM_CR_RESTART_TX ((ushort)0x0006) #define CPM_CR_SET_GADDR ((ushort)0x0008) #define CPM_CR_START_IDMA ((ushort)0x0009) -- cgit v0.10.2 From 6e1976961c9bd9a3dc368139fab1883961efc879 Mon Sep 17 00:00:00 2001 From: Vitaly Bordug Date: Sat, 29 Apr 2006 23:06:00 +0400 Subject: [PATCH] ppc32 CPM_UART: fixes and improvements A number of small issues are fixed, and added the header file, missed from the original series. With this, driver should be pretty stable as tested among both platform-device-driven and "old way" boards. Also added missing GPL statement , and updated year field on existing ones to reflect code update. Signed-off-by: Vitaly Bordug Signed-off-by: Paul Mackerras diff --git a/arch/ppc/platforms/mpc866ads_setup.c b/arch/ppc/platforms/mpc866ads_setup.c index 6ce3b84..d919dab 100644 --- a/arch/ppc/platforms/mpc866ads_setup.c +++ b/arch/ppc/platforms/mpc866ads_setup.c @@ -378,7 +378,7 @@ int __init mpc866ads_init(void) ppc_sys_device_setfunc(MPC8xx_CPM_SMC1, PPC_SYS_FUNC_UART); #endif -#ifdef CONFIG_SERIAL_CPM_SMCer +#ifdef CONFIG_SERIAL_CPM_SMC ppc_sys_device_enable(MPC8xx_CPM_SMC2); ppc_sys_device_setfunc(MPC8xx_CPM_SMC2, PPC_SYS_FUNC_UART); #endif diff --git a/drivers/serial/cpm_uart/cpm_uart.h b/drivers/serial/cpm_uart/cpm_uart.h index aa5eb7d..3b35cb7 100644 --- a/drivers/serial/cpm_uart/cpm_uart.h +++ b/drivers/serial/cpm_uart/cpm_uart.h @@ -5,6 +5,13 @@ * * Copyright (C) 2004 Freescale Semiconductor, Inc. * + * 2006 (c) MontaVista Software, Inc. + * Vitaly Bordug + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + * */ #ifndef CPM_UART_H #define CPM_UART_H @@ -101,12 +108,13 @@ static inline unsigned long cpu2cpm_addr(void* addr, struct uart_cpm_port *pinfo int offset; u32 val = (u32)addr; /* sane check */ - if ((val >= (u32)pinfo->mem_addr) && + if (likely((val >= (u32)pinfo->mem_addr)) && (val<((u32)pinfo->mem_addr + pinfo->mem_size))) { offset = val - (u32)pinfo->mem_addr; return pinfo->dma_addr+offset; } - printk("%s(): address %x to translate out of range!\n", __FUNCTION__, val); + /* something nasty happened */ + BUG(); return 0; } @@ -115,12 +123,13 @@ static inline void *cpm2cpu_addr(unsigned long addr, struct uart_cpm_port *pinfo int offset; u32 val = addr; /* sane check */ - if ((val >= pinfo->dma_addr) && - (val<(pinfo->dma_addr + pinfo->mem_size))) { + if (likely((val >= pinfo->dma_addr) && + (val<(pinfo->dma_addr + pinfo->mem_size)))) { offset = val - (u32)pinfo->dma_addr; return (void*)(pinfo->mem_addr+offset); } - printk("%s(): address %x to translate out of range!\n", __FUNCTION__, val); + /* something nasty happened */ + BUG(); return 0; } diff --git a/drivers/serial/cpm_uart/cpm_uart_core.c b/drivers/serial/cpm_uart/cpm_uart_core.c index b9d1185..969f949 100644 --- a/drivers/serial/cpm_uart/cpm_uart_core.c +++ b/drivers/serial/cpm_uart/cpm_uart_core.c @@ -12,7 +12,8 @@ * * Copyright (C) 2004 Freescale Semiconductor, Inc. * (C) 2004 Intracom, S.A. - * (C) 2005 MontaVista Software, Inc. by Vitaly Bordug + * (C) 2005-2006 MontaVista Software, Inc. + * Vitaly Bordug * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -81,7 +82,7 @@ early_uart_get_pdev(int index) } -void cpm_uart_count(void) +static void cpm_uart_count(void) { cpm_uart_nr = 0; #ifdef CONFIG_SERIAL_CPM_SMC1 @@ -104,6 +105,21 @@ void cpm_uart_count(void) #endif } +/* Get UART number by its id */ +static int cpm_uart_id2nr(int id) +{ + int i; + if (id < UART_NR) { + for (i=0; ifs_no; line++); + line = cpm_uart_id2nr(idx); + if(line < 0) { + printk(KERN_ERR"%s(): port %d is not registered", __FUNCTION__, idx); + return -1; + } pinfo = (struct uart_cpm_port *) &cpm_uart_ports[idx]; @@ -1245,8 +1265,7 @@ static int cpm_uart_drv_probe(struct device *dev) } pdata = pdev->dev.platform_data; - pr_debug("cpm_uart_drv_probe: Adding CPM UART %d\n", - cpm_uart_port_map[pdata->fs_no]); + pr_debug("cpm_uart_drv_probe: Adding CPM UART %d\n", cpm_uart_id2nr(pdata->fs_no)); if ((ret = cpm_uart_drv_get_platform_data(pdev, 0))) return ret; @@ -1265,7 +1284,7 @@ static int cpm_uart_drv_remove(struct device *dev) struct fs_uart_platform_info *pdata = pdev->dev.platform_data; pr_debug("cpm_uart_drv_remove: Removing CPM UART %d\n", - cpm_uart_port_map[pdata->fs_no]); + cpm_uart_id2nr(pdata->fs_no)); uart_remove_one_port(&cpm_reg, &cpm_uart_ports[pdata->fs_no].port); return 0; diff --git a/drivers/serial/cpm_uart/cpm_uart_cpm1.c b/drivers/serial/cpm_uart/cpm_uart_cpm1.c index a5a3062..17406a0 100644 --- a/drivers/serial/cpm_uart/cpm_uart_cpm1.c +++ b/drivers/serial/cpm_uart/cpm_uart_cpm1.c @@ -8,6 +8,8 @@ * * Copyright (C) 2004 Freescale Semiconductor, Inc. * (C) 2004 Intracom, S.A. + * (C) 2006 MontaVista Software, Inc. + * Vitaly Bordug * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/drivers/serial/cpm_uart/cpm_uart_cpm2.c b/drivers/serial/cpm_uart/cpm_uart_cpm2.c index 7c6b07a..4b2de08 100644 --- a/drivers/serial/cpm_uart/cpm_uart_cpm2.c +++ b/drivers/serial/cpm_uart/cpm_uart_cpm2.c @@ -8,6 +8,8 @@ * * Copyright (C) 2004 Freescale Semiconductor, Inc. * (C) 2004 Intracom, S.A. + * (C) 2006 MontaVista Software, Inc. + * Vitaly Bordug * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/include/linux/fs_uart_pd.h b/include/linux/fs_uart_pd.h new file mode 100644 index 0000000..f597512 --- /dev/null +++ b/include/linux/fs_uart_pd.h @@ -0,0 +1,60 @@ +/* + * Platform information definitions for the CPM Uart driver. + * + * 2006 (c) MontaVista Software, Inc. + * Vitaly Bordug + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#ifndef FS_UART_PD_H +#define FS_UART_PD_H + +#include +#include + +enum fs_uart_id { + fsid_smc1_uart, + fsid_smc2_uart, + fsid_scc1_uart, + fsid_scc2_uart, + fsid_scc3_uart, + fsid_scc4_uart, + fs_uart_nr, +}; + +static inline int fs_uart_id_scc2fsid(int id) +{ + return fsid_scc1_uart + id - 1; +} + +static inline int fs_uart_id_fsid2scc(int id) +{ + return id - fsid_scc1_uart + 1; +} + +static inline int fs_uart_id_smc2fsid(int id) +{ + return fsid_smc1_uart + id - 1; +} + +static inline int fs_uart_id_fsid2smc(int id) +{ + return id - fsid_smc1_uart + 1; +} + +struct fs_uart_platform_info { + void(*init_ioports)(void); + /* device specific information */ + int fs_no; /* controller index */ + u32 uart_clk; + u8 tx_num_fifo; + u8 tx_buf_size; + u8 rx_num_fifo; + u8 rx_buf_size; + u8 brg; +}; + +#endif -- cgit v0.10.2 From 6bfd93c32a5065d0e858780b3beb0b667081601c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 3 May 2006 23:02:04 +1000 Subject: powerpc: Fix incorrect might_sleep in __get_user/__put_user on kernel addresses We have a case where __get_user and __put_user can validly be used on kernel addresses in interrupt context - namely, the alignment exception handler, as our get/put_unaligned just do a single access and rely on the alignment exception handler to fix things up in the rare cases where the cpu can't handle it in hardware. Thus we can get alignment exceptions in the network stack at interrupt level. The alignment exception handler does a __get_user to read the instruction and blows up in might_sleep(). Since a __get_user on a kernel address won't actually ever sleep, this makes the might_sleep conditional on the address being less than PAGE_OFFSET. Signed-off-by: Paul Mackerras diff --git a/include/asm-powerpc/uaccess.h b/include/asm-powerpc/uaccess.h index 3872e92..d83fc29 100644 --- a/include/asm-powerpc/uaccess.h +++ b/include/asm-powerpc/uaccess.h @@ -7,6 +7,7 @@ #include #include #include +#include #define VERIFY_READ 0 #define VERIFY_WRITE 1 @@ -179,9 +180,11 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ long __pu_err; \ - might_sleep(); \ + __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ + if (!is_kernel_addr((unsigned long)__pu_addr)) \ + might_sleep(); \ __chk_user_ptr(ptr); \ - __put_user_size((x), (ptr), (size), __pu_err); \ + __put_user_size((x), __pu_addr, (size), __pu_err); \ __pu_err; \ }) @@ -258,9 +261,11 @@ do { \ ({ \ long __gu_err; \ unsigned long __gu_val; \ + const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ - might_sleep(); \ - __get_user_size(__gu_val, (ptr), (size), __gu_err); \ + if (!is_kernel_addr((unsigned long)__gu_addr)) \ + might_sleep(); \ + __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ (x) = (__typeof__(*(ptr)))__gu_val; \ __gu_err; \ }) @@ -270,9 +275,11 @@ do { \ ({ \ long __gu_err; \ long long __gu_val; \ + const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ - might_sleep(); \ - __get_user_size(__gu_val, (ptr), (size), __gu_err); \ + if (!is_kernel_addr((unsigned long)__gu_addr)) \ + might_sleep(); \ + __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ (x) = (__typeof__(*(ptr)))__gu_val; \ __gu_err; \ }) -- cgit v0.10.2 From d205819e2346d20fee41297ea6cf789c591abccf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 3 May 2006 23:04:37 +1000 Subject: [PATCH] powerpc: Use the ibm,pa-features property if available Forthcoming IBM machines will have a "ibm,pa-features" property on CPU nodes, that contains bits indicating which optional architecture features are implemented by the CPU. This adds code to use the property, if present, to update our CPU feature bitmaps. Note that this means we can both set and clear feature bits based on what the firmware tells us. This is based on a patch by Will Schmidt . Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 1cb69e8..9a07f97 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -885,6 +885,74 @@ void __init unflatten_device_tree(void) DBG(" <- unflatten_device_tree()\n"); } +/* + * ibm,pa-features is a per-cpu property that contains a string of + * attribute descriptors, each of which has a 2 byte header plus up + * to 254 bytes worth of processor attribute bits. First header + * byte specifies the number of bytes following the header. + * Second header byte is an "attribute-specifier" type, of which + * zero is the only currently-defined value. + * Implementation: Pass in the byte and bit offset for the feature + * that we are interested in. The function will return -1 if the + * pa-features property is missing, or a 1/0 to indicate if the feature + * is supported/not supported. Note that the bit numbers are + * big-endian to match the definition in PAPR. + */ +static struct ibm_pa_feature { + unsigned long cpu_features; /* CPU_FTR_xxx bit */ + unsigned int cpu_user_ftrs; /* PPC_FEATURE_xxx bit */ + unsigned char pabyte; /* byte number in ibm,pa-features */ + unsigned char pabit; /* bit number (big-endian) */ + unsigned char invert; /* if 1, pa bit set => clear feature */ +} ibm_pa_features[] __initdata = { + {0, PPC_FEATURE_HAS_MMU, 0, 0, 0}, + {0, PPC_FEATURE_HAS_FPU, 0, 1, 0}, + {CPU_FTR_SLB, 0, 0, 2, 0}, + {CPU_FTR_CTRL, 0, 0, 3, 0}, + {CPU_FTR_NOEXECUTE, 0, 0, 6, 0}, + {CPU_FTR_NODSISRALIGN, 0, 1, 1, 1}, + {CPU_FTR_CI_LARGE_PAGE, 0, 1, 2, 0}, +}; + +static void __init check_cpu_pa_features(unsigned long node) +{ + unsigned char *pa_ftrs; + unsigned long len, tablelen, i, bit; + + pa_ftrs = of_get_flat_dt_prop(node, "ibm,pa-features", &tablelen); + if (pa_ftrs == NULL) + return; + + /* find descriptor with type == 0 */ + for (;;) { + if (tablelen < 3) + return; + len = 2 + pa_ftrs[0]; + if (tablelen < len) + return; /* descriptor 0 not found */ + if (pa_ftrs[1] == 0) + break; + tablelen -= len; + pa_ftrs += len; + } + + /* loop over bits we know about */ + for (i = 0; i < ARRAY_SIZE(ibm_pa_features); ++i) { + struct ibm_pa_feature *fp = &ibm_pa_features[i]; + + if (fp->pabyte >= pa_ftrs[0]) + continue; + bit = (pa_ftrs[2 + fp->pabyte] >> (7 - fp->pabit)) & 1; + if (bit ^ fp->invert) { + cur_cpu_spec->cpu_features |= fp->cpu_features; + cur_cpu_spec->cpu_user_features |= fp->cpu_user_ftrs; + } else { + cur_cpu_spec->cpu_features &= ~fp->cpu_features; + cur_cpu_spec->cpu_user_features &= ~fp->cpu_user_ftrs; + } + } +} + static int __init early_init_dt_scan_cpus(unsigned long node, const char *uname, int depth, void *data) @@ -969,6 +1037,8 @@ static int __init early_init_dt_scan_cpus(unsigned long node, } #endif /* CONFIG_ALTIVEC */ + check_cpu_pa_features(node); + #ifdef CONFIG_PPC_PSERIES if (nthreads > 1) cur_cpu_spec->cpu_features |= CPU_FTR_SMT; -- cgit v0.10.2 From 30d55280b867aa0cae99f836ad0181bb0bf8f9cb Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Wed, 3 May 2006 10:52:36 -0400 Subject: [PATCH] selinux: Clear selinux_enabled flag upon runtime disable. Clear selinux_enabled flag upon runtime disable of SELinux by userspace, and make sure it is defined even if selinux= boot parameter support is not enabled in configuration. Signed-off-by: Stephen Smalley Acked-by: James Morris Tested-by: Jon Smirl Acked-by: Al Viro Signed-off-by: Linus Torvalds diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 3cf368a..d987048 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -101,6 +101,8 @@ static int __init selinux_enabled_setup(char *str) return 1; } __setup("selinux=", selinux_enabled_setup); +#else +int selinux_enabled = 1; #endif /* Original (dummy) security module. */ @@ -4535,6 +4537,7 @@ int selinux_disable(void) printk(KERN_INFO "SELinux: Disabled at runtime.\n"); selinux_disabled = 1; + selinux_enabled = 0; /* Reset security_ops to the secondary module, dummy or capability. */ security_ops = secondary_ops; diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 5f016c9..063af47 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -29,12 +29,7 @@ #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE #define POLICYDB_VERSION_MAX POLICYDB_VERSION_AVTAB -#ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM extern int selinux_enabled; -#else -#define selinux_enabled 1 -#endif - extern int selinux_mls_enabled; int security_load_policy(void * data, size_t len); -- cgit v0.10.2 From 8c45112b823972e9ff7bbca77dc592ca2224951b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 3 May 2006 13:55:46 -0700 Subject: [SPARC]: Hook up vmsplice into syscall tables. Signed-off-by: David S. Miller diff --git a/arch/sparc/kernel/systbls.S b/arch/sparc/kernel/systbls.S index db8faa7..6e1135c 100644 --- a/arch/sparc/kernel/systbls.S +++ b/arch/sparc/kernel/systbls.S @@ -23,7 +23,7 @@ sys_call_table: /*10*/ .long sys_unlink, sunos_execv, sys_chdir, sys_chown16, sys_mknod /*15*/ .long sys_chmod, sys_lchown16, sparc_brk, sys_nis_syscall, sys_lseek /*20*/ .long sys_getpid, sys_capget, sys_capset, sys_setuid16, sys_getuid16 -/*25*/ .long sys_time, sys_ptrace, sys_alarm, sys_sigaltstack, sys_pause +/*25*/ .long sys_vmsplice, sys_ptrace, sys_alarm, sys_sigaltstack, sys_pause /*30*/ .long sys_utime, sys_lchown, sys_fchown, sys_access, sys_nice /*35*/ .long sys_chown, sys_sync, sys_kill, sys_newstat, sys_sendfile /*40*/ .long sys_newlstat, sys_dup, sys_pipe, sys_times, sys_getuid diff --git a/arch/sparc64/kernel/sys32.S b/arch/sparc64/kernel/sys32.S index f9b7576..bdf1f4d 100644 --- a/arch/sparc64/kernel/sys32.S +++ b/arch/sparc64/kernel/sys32.S @@ -139,6 +139,7 @@ SIGN3(sys32_ioprio_set, sys_ioprio_set, %o0, %o1, %o2) SIGN2(sys32_splice, sys_splice, %o0, %o1) SIGN2(sys32_sync_file_range, compat_sync_file_range, %o0, %o5) SIGN2(sys32_tee, sys_tee, %o0, %o1) +SIGN1(sys32_vmsplice, compat_sys_vmsplice, %o0) .globl sys32_mmap2 sys32_mmap2: diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S index 62672cd..d4b39cd 100644 --- a/arch/sparc64/kernel/systbls.S +++ b/arch/sparc64/kernel/systbls.S @@ -25,7 +25,7 @@ sys_call_table32: /*10*/ .word sys_unlink, sunos_execv, sys_chdir, sys32_chown16, sys32_mknod /*15*/ .word sys_chmod, sys32_lchown16, sparc_brk, sys32_perfctr, sys32_lseek /*20*/ .word sys_getpid, sys_capget, sys_capset, sys32_setuid16, sys32_getuid16 -/*25*/ .word compat_sys_time, sys_ptrace, sys_alarm, sys32_sigaltstack, sys32_pause +/*25*/ .word sys32_vmsplice, sys_ptrace, sys_alarm, sys32_sigaltstack, sys32_pause /*30*/ .word compat_sys_utime, sys_lchown, sys_fchown, sys32_access, sys32_nice .word sys_chown, sys_sync, sys32_kill, compat_sys_newstat, sys32_sendfile /*40*/ .word compat_sys_newlstat, sys_dup, sys_pipe, compat_sys_times, sys_getuid @@ -94,7 +94,7 @@ sys_call_table: /*10*/ .word sys_unlink, sys_nis_syscall, sys_chdir, sys_chown, sys_mknod /*15*/ .word sys_chmod, sys_lchown, sparc_brk, sys_perfctr, sys_lseek /*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid, sys_getuid -/*25*/ .word sys_nis_syscall, sys_ptrace, sys_alarm, sys_sigaltstack, sys_nis_syscall +/*25*/ .word sys_vmsplice, sys_ptrace, sys_alarm, sys_sigaltstack, sys_nis_syscall /*30*/ .word sys_utime, sys_nis_syscall, sys_nis_syscall, sys_access, sys_nice .word sys_nis_syscall, sys_sync, sys_kill, sys_newstat, sys_sendfile64 /*40*/ .word sys_newlstat, sys_dup, sys_pipe, sys_times, sys_nis_syscall diff --git a/include/asm-sparc/unistd.h b/include/asm-sparc/unistd.h index 32a48f6..f5611a7 100644 --- a/include/asm-sparc/unistd.h +++ b/include/asm-sparc/unistd.h @@ -41,7 +41,7 @@ #define __NR_capset 22 /* Linux Specific */ #define __NR_setuid 23 /* Implemented via setreuid in SunOS */ #define __NR_getuid 24 /* Common */ -/* #define __NR_time alias 25 ENOSYS under SunOS */ +#define __NR_vmsplice 25 /* ENOSYS under SunOS */ #define __NR_ptrace 26 /* Common */ #define __NR_alarm 27 /* Implemented via setitimer in SunOS */ #define __NR_sigaltstack 28 /* Common */ diff --git a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h index ca80e8a..6870574 100644 --- a/include/asm-sparc64/unistd.h +++ b/include/asm-sparc64/unistd.h @@ -41,7 +41,7 @@ #define __NR_capset 22 /* Linux Specific */ #define __NR_setuid 23 /* Implemented via setreuid in SunOS */ #define __NR_getuid 24 /* Common */ -/* #define __NR_time alias 25 ENOSYS under SunOS */ +#define __NR_vmsplice 25 /* ENOSYS under SunOS */ #define __NR_ptrace 26 /* Common */ #define __NR_alarm 27 /* Implemented via setitimer in SunOS */ #define __NR_sigaltstack 28 /* Common */ -- cgit v0.10.2 From f0ec5e39765cd254d436a6d86e211d81795952a4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 3 May 2006 19:54:57 -0700 Subject: [PATCH] Remove wrong cpu_has_apic checks that came from mismerging We only need to check cpu_has_apic in the IO-APIC/L-APIC parsing, not for all of ACPI. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 4c785a6..40e5aba 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -1102,9 +1102,6 @@ int __init acpi_boot_table_init(void) dmi_check_system(acpi_dmi_table); #endif - if (!cpu_has_apic) - return -ENODEV; - /* * If acpi_disabled, bail out * One exception: acpi=ht continues far enough to enumerate LAPICs @@ -1151,9 +1148,6 @@ int __init acpi_boot_init(void) acpi_table_parse(ACPI_BOOT, acpi_parse_sbf); - if (!cpu_has_apic) - return -ENODEV; - /* * set sci_int and PM timer address */ -- cgit v0.10.2 From 6760da0197a6ee327a09dafc070b26e2f02651fe Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Wed, 3 May 2006 19:55:03 -0700 Subject: [PATCH] uml: change timer initialization inet_init, which schedules, is called before the UML timer_init, which sets up the timer. The result is the interval timers being manipulated before the appropriate signal handlers are established, causing unhandled timers. This is fixed by making timer_init be called earlier. Signed-off-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/um/kernel/time_kern.c b/arch/um/kernel/time_kern.c index 3c7626c..528cf62 100644 --- a/arch/um/kernel/time_kern.c +++ b/arch/um/kernel/time_kern.c @@ -209,4 +209,4 @@ int __init timer_init(void) return(0); } -__initcall(timer_init); +arch_initcall(timer_init); -- cgit v0.10.2 From 96941026a51e9cb8c2d2be7499301b7fcd9ee225 Mon Sep 17 00:00:00 2001 From: mark gross Date: Wed, 3 May 2006 19:55:07 -0700 Subject: [PATCH] EDAC Coexistence with BIOS Address the issue of EDAC/BIOS coexistence for the e752x chip-sets. We have found a problem where the BIOS will start the system with the error registers (dev0:fun1) hidden and assuming it has exclusive access to them. The edac driver violates this assumption. The workaround this patch offers is to honor the hidden-ness as an indication that it is not safe to use those registers. Signed-off-by: Mark Gross Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c index 66572c5..fce3193 100644 --- a/drivers/edac/e752x_edac.c +++ b/drivers/edac/e752x_edac.c @@ -25,6 +25,8 @@ #include #include "edac_mc.h" +static int force_function_unhide; + #define e752x_printk(level, fmt, arg...) \ edac_printk(level, "e752x", fmt, ##arg) @@ -782,8 +784,16 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx) debugf0("%s(): mci\n", __func__); debugf0("Starting Probe1\n"); - /* enable device 0 function 1 */ + /* check to see if device 0 function 1 is enabled; if it isn't, we + * assume the BIOS has reserved it for a reason and is expecting + * exclusive access, we take care not to violate that assumption and + * fail the probe. */ pci_read_config_byte(pdev, E752X_DEVPRES1, &stat8); + if (!force_function_unhide && !(stat8 & (1 << 5))) { + printk(KERN_INFO "Contact your BIOS vendor to see if the " + "E752x error registers can be safely un-hidden\n"); + goto fail; + } stat8 |= (1 << 5); pci_write_config_byte(pdev, E752X_DEVPRES1, stat8); @@ -1063,3 +1073,8 @@ module_exit(e752x_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Linux Networx (http://lnxi.com) Tom Zimmerman\n"); MODULE_DESCRIPTION("MC support for Intel e752x memory controllers"); + +module_param(force_function_unhide, int, 0444); +MODULE_PARM_DESC(force_function_unhide, "if BIOS sets Dev0:Fun1 up as hidden:" +" 1=force unhide and hope BIOS doesn't fight driver for Dev0:Fun1 access"); + -- cgit v0.10.2 From 8683dc9990158c221e05959935e7dd50a956c574 Mon Sep 17 00:00:00 2001 From: Brent Casavant Date: Wed, 3 May 2006 19:55:10 -0700 Subject: [PATCH] Altix: correct ioc4 port order Currently loading the ioc3 as a module will cause the ports to be numbered in reverse order. This mod maintains the proper order of cards for port numbering. Signed-off-by: Brent Casavant Cc: Pat Gefre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/sn/ioc4.c b/drivers/sn/ioc4.c index 67140a5..cdeff90 100644 --- a/drivers/sn/ioc4.c +++ b/drivers/sn/ioc4.c @@ -310,7 +310,7 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) pci_set_drvdata(idd->idd_pdev, idd); mutex_lock(&ioc4_mutex); - list_add(&idd->idd_list, &ioc4_devices); + list_add_tail(&idd->idd_list, &ioc4_devices); /* Add this IOC4 to all submodules */ list_for_each_entry(is, &ioc4_submodules, is_list) { -- cgit v0.10.2 From 5dea5176e5c32ef9f0d1a41d28427b3bf6881b3a Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Wed, 3 May 2006 19:55:12 -0700 Subject: [PATCH] ext3: multile block allocate little endian fixes Some places in ext3 multiple block allocation code (in 2.6.17-rc3) don't handle the little endian well. This was resulting in *wrong* block numbers being assigned to in-memory block variables and then stored on disk eventually. The following patch has been verified to fix an ext3 filesystem failure when run ltp test on a 64 bit machine. Signed-off-by; Mingming Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 48ae033..2edd7ee 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -711,7 +711,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, * direct blocks blocks */ if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key + 1); + current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < blks; i++) *(where->p + i ) = cpu_to_le32(current_block++); } @@ -724,7 +724,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, if (block_i) { block_i->last_alloc_logical_block = block + blks - 1; block_i->last_alloc_physical_block = - le32_to_cpu(where[num].key + blks - 1); + le32_to_cpu(where[num].key) + blks - 1; } /* We are done with atomic stuff, now do the rest of housekeeping */ @@ -814,11 +814,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, /* Simplest case - block found, no allocation needed */ if (!partial) { - first_block = chain[depth - 1].key; + first_block = le32_to_cpu(chain[depth - 1].key); clear_buffer_new(bh_result); count++; /*map more blocks*/ while (count < maxblocks && count <= blocks_to_boundary) { + unsigned long blk; + if (!verify_chain(chain, partial)) { /* * Indirect block might be removed by @@ -831,8 +833,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, count = 0; break; } - if (le32_to_cpu(*(chain[depth-1].p+count) == - (first_block + count))) + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) count++; else break; -- cgit v0.10.2 From bfc4ee39fdbb2deb8864785d5e5bc5cdd3b31a69 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 3 May 2006 10:35:10 +0200 Subject: [PATCH] splice: fix unlocking of page on error ->prepare_write() Looking at generic_file_buffered_write(), we need to unlock_page() if prepare write fails and it isn't due to racing with truncate(). Also trim the size if ->prepare_write() fails, if we have to. Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index 7fb0497..27f5e37 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -647,11 +647,24 @@ find_page: } ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); - if (ret == AOP_TRUNCATED_PAGE) { + if (unlikely(ret)) { + loff_t isize = i_size_read(mapping->host); + + if (ret != AOP_TRUNCATED_PAGE) + unlock_page(page); page_cache_release(page); - goto find_page; - } else if (ret) + if (ret == AOP_TRUNCATED_PAGE) + goto find_page; + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. + */ + if (sd->pos + this_len > isize) + vmtruncate(mapping->host, isize); + goto out; + } if (buf->page != page) { /* -- cgit v0.10.2 From 1432873af7ae29d4bb3c56114c05b539d078ca62 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 3 May 2006 10:35:26 +0200 Subject: [PATCH] splice: LRU fixups Nick says that the current construct isn't safe. This goes back to the original, but sets PIPE_BUF_FLAG_LRU on user pages as well as they all seem to be on the LRU in the first place. Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index 27f5e37..0b20242 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -78,6 +78,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, return 1; } + buf->flags |= PIPE_BUF_FLAG_LRU; return 0; } @@ -85,6 +86,7 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) { page_cache_release(buf->page); + buf->flags &= ~PIPE_BUF_FLAG_LRU; } static int page_cache_pipe_buf_pin(struct pipe_inode_info *info, @@ -141,6 +143,7 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) return 1; + buf->flags |= PIPE_BUF_FLAG_LRU; return generic_pipe_buf_steal(pipe, buf); } @@ -566,37 +569,23 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, */ if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) { /* - * If steal succeeds, buf->page is now pruned from the vm - * side (page cache) and we can reuse it. The page will also - * be locked on successful return. + * If steal succeeds, buf->page is now pruned from the + * pagecache and we can reuse it. The page will also be + * locked on successful return. */ if (buf->ops->steal(info, buf)) goto find_page; page = buf->page; - page_cache_get(page); - - /* - * page must be on the LRU for adding to the pagecache. - * Check this without grabbing the zone lock, if it isn't - * the do grab the zone lock, recheck, and add if necessary. - */ - if (!PageLRU(page)) { - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - if (!PageLRU(page)) { - SetPageLRU(page); - add_page_to_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - } - if (add_to_page_cache(page, mapping, index, gfp_mask)) { - page_cache_release(page); unlock_page(page); goto find_page; } + + page_cache_get(page); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add(page); } else { find_page: page = find_lock_page(mapping, index); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index ba73108..ea4f7cd 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -5,8 +5,9 @@ #define PIPE_BUFFERS (16) -#define PIPE_BUF_FLAG_ATOMIC 0x01 /* was atomically mapped */ -#define PIPE_BUF_FLAG_GIFT 0x02 /* page is a gift */ +#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ +#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ +#define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ struct pipe_buffer { struct page *page; -- cgit v0.10.2 From 76ad4d11105ccd40a536db1057083f28326019fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 3 May 2006 10:41:33 +0200 Subject: [PATCH] splice: rename remaining info variables to pipe Same thing was done in fs/pipe.c and most of fs/splice.c, but we had a few missing still. Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index 0b20242..8fa9217 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -51,7 +51,7 @@ struct splice_pipe_desc { * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ -static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, +static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; @@ -82,14 +82,14 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, return 0; } -static void page_cache_pipe_buf_release(struct pipe_inode_info *info, +static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { page_cache_release(buf->page); buf->flags &= ~PIPE_BUF_FLAG_LRU; } -static int page_cache_pipe_buf_pin(struct pipe_inode_info *info, +static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; @@ -500,14 +500,14 @@ EXPORT_SYMBOL(generic_file_splice_read); * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). Return the number of bytes sent. */ -static int pipe_to_sendpage(struct pipe_inode_info *info, +static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->file; loff_t pos = sd->pos; int ret, more; - ret = buf->ops->pin(info, buf); + ret = buf->ops->pin(pipe, buf); if (!ret) { more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; @@ -538,7 +538,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create * a new page in the output file page cache and fill/dirty that. */ -static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, +static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->file; @@ -552,7 +552,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, /* * make sure the data in this buffer is uptodate */ - ret = buf->ops->pin(info, buf); + ret = buf->ops->pin(pipe, buf); if (unlikely(ret)) return ret; @@ -573,7 +573,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, * pagecache and we can reuse it. The page will also be * locked on successful return. */ - if (buf->ops->steal(info, buf)) + if (buf->ops->steal(pipe, buf)) goto find_page; page = buf->page; @@ -659,13 +659,13 @@ find_page: /* * Careful, ->map() uses KM_USER0! */ - char *src = buf->ops->map(info, buf, 1); + char *src = buf->ops->map(pipe, buf, 1); char *dst = kmap_atomic(page, KM_USER1); memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst, KM_USER1); - buf->ops->unmap(info, buf, src); + buf->ops->unmap(pipe, buf, src); } ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); -- cgit v0.10.2 From a0548871ed267ae12eb1c860c5aaebd9e466b34e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 3 May 2006 10:58:22 +0200 Subject: [PATCH] splice: redo page lookup if add_to_page_cache() returns -EEXIST This can happen quite easily, if several processes are trying to splice the same file at the same time. It's not a failure, it just means someone raced with us in allocating this file page. So just dump the allocated page and relookup the original. Signed-off-by: Jens Axboe diff --git a/fs/splice.c b/fs/splice.c index 8fa9217..a285fd7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -324,6 +324,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, mapping_gfp_mask(mapping)); if (unlikely(error)) { page_cache_release(page); + if (error == -EEXIST) + continue; break; } /* -- cgit v0.10.2 From 6fd737031eb6869430d0f3cf6bf1440adf7aedf5 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 May 2006 23:16:29 -0700 Subject: [NETFILTER]: H.323 helper: fix endless loop caused by invalid TPKT len When the TPKT len included in the packet is below the lowest valid value of 4 an underflow occurs which results in an endless loop. Found by testcase 0000058 from the PROTOS c07-h2250v4 testsuite. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index 2c2fb70..518f581 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c @@ -162,6 +162,8 @@ static int get_tpkt_data(struct sk_buff **pskb, struct ip_conntrack *ct, /* Validate TPKT length */ tpktlen = tpkt[2] * 256 + tpkt[3]; + if (tpktlen < 4) + goto clear_out; if (tpktlen > tcpdatalen) { if (tcpdatalen == 4) { /* Separate TPKT header */ /* Netmeeting sends TPKT header and data separately */ -- cgit v0.10.2 From 4228e2a9890cd01b0c8cc58af6fd9e08a4b5e8a7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 May 2006 23:17:11 -0700 Subject: [NETFILTER]: H.323 helper: fix use of uninitialized data When a Choice element contains an unsupported choice no error is returned and parsing continues normally, but the choice value is not set and contains data from the last parsed message. This may in turn lead to parsing of more stale data and following crashes. Fixes a crash triggered by testcase 0003243 from the PROTOS c07-h2250v4 testsuite following random other testcases: CPU: 0 EIP: 0060:[] Not tainted VLI EFLAGS: 00210646 (2.6.17-rc2 #3) EIP is at memmove+0x19/0x22 eax: d7be0307 ebx: d7be0307 ecx: e841fcf9 edx: d7be0307 esi: bfffffff edi: bfffffff ebp: da5eb980 esp: c0347e2c ds: 007b es: 007b ss: 0068 Process events/0 (pid: 4, threadinfo=c0347000 task=dff86a90) Stack: <0>00000006 c0347ea6 d7be0301 e09a6b2c 00000006 da5eb980 d7be003e d7be0052 c0347f6c e09a6d9c 00000006 c0347ea6 00000006 00000000 d7b9a548 00000000 c0347f6c d7b9a548 00000004 e0a1a119 0000028f 00000006 c0347ea6 00000006 Call Trace: [] mangle_contents+0x40/0xd8 [ip_nat] [] ip_nat_mangle_tcp_packet+0xa1/0x191 [ip_nat] [] set_addr+0x60/0x14d [ip_nat_h323] [] q931_help+0x2da/0x71a [ip_conntrack_h323] [] q931_help+0x30c/0x71a [ip_conntrack_h323] [] ip_conntrack_help+0x22/0x2f [ip_conntrack] [] nf_iterate+0x2e/0x5f [] xfrm4_output_finish+0x0/0x39f [] nf_hook_slow+0x42/0xb0 [] xfrm4_output_finish+0x0/0x39f [] xfrm4_output+0x3c/0x4e [] xfrm4_output_finish+0x0/0x39f [] ip_forward+0x1c2/0x1fa [] ip_rcv+0x388/0x3b5 [] netif_receive_skb+0x2bc/0x2ec [] process_backlog+0x6b/0xd0 [] net_rx_action+0x4b/0xb7 [] __do_softirq+0x35/0x7d [] do_softirq+0x38/0x3f Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c index 4807800..f52d2c4 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c @@ -703,6 +703,10 @@ int decode_choice(bitstr_t * bs, field_t * f, char *base, int level) type = get_bits(bs, f->sz); } + /* Write Type */ + if (base) + *(unsigned *) base = type; + /* Check Range */ if (type >= f->ub) { /* Newer version? */ BYTE_ALIGN(bs); @@ -712,10 +716,6 @@ int decode_choice(bitstr_t * bs, field_t * f, char *base, int level) return H323_ERROR_NONE; } - /* Write Type */ - if (base) - *(unsigned *) base = type; - /* Transfer to son level */ son = &f->fields[type]; if (son->attr & STOP) { -- cgit v0.10.2 From 2354feaeb2acb78f6aabdf8410d55b44492a7949 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 May 2006 23:19:26 -0700 Subject: [NETFILTER]: NAT: silence unused variable warnings with CONFIG_XFRM=n net/ipv4/netfilter/ip_nat_standalone.c: In function 'ip_nat_out': net/ipv4/netfilter/ip_nat_standalone.c:223: warning: unused variable 'ctinfo' net/ipv4/netfilter/ip_nat_standalone.c:222: warning: unused variable 'ct' Surprisingly no complaints so far .. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 8f760b2..67e6767 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -219,8 +219,10 @@ ip_nat_out(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { +#ifdef CONFIG_XFRM struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; +#endif unsigned int ret; /* root is playing with raw sockets. */ -- cgit v0.10.2 From 7582e9d17edbabab6cbe59467c5d1b5e8c04fca8 Mon Sep 17 00:00:00 2001 From: Jing Min Zhao Date: Wed, 3 May 2006 23:19:59 -0700 Subject: [NETFILTER]: H.323 helper: Change author's email address Signed-off-by: Jing Min Zhao Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_asn1.h b/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_asn1.h index 0bd8280..c6e9a0b 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_asn1.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_asn1.h @@ -2,7 +2,7 @@ * ip_conntrack_helper_h323_asn1.h - BER and PER decoding library for H.323 * conntrack/NAT module. * - * Copyright (c) 2006 by Jing Min Zhao + * Copyright (c) 2006 by Jing Min Zhao * * This source code is licensed under General Public License version 2. * diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c index f52d2c4..355a53a 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c @@ -2,7 +2,7 @@ * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323 * conntrack/NAT module. * - * Copyright (c) 2006 by Jing Min Zhao + * Copyright (c) 2006 by Jing Min Zhao * * This source code is licensed under General Public License version 2. * -- cgit v0.10.2 From 7800007c1e2d42cd4120b87b0ba3f3480f17f30a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 May 2006 23:20:27 -0700 Subject: [NETFILTER]: x_tables: don't use __copy_{from,to}_user on unchecked memory in compat layer Noticed by Linus Torvalds Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6d1c115..cee3397 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1441,7 +1441,7 @@ static int compat_copy_entry_to_user(struct ipt_entry *e, ret = -EFAULT; origsize = *size; ce = (struct compat_ipt_entry __user *)*dstptr; - if (__copy_to_user(ce, e, sizeof(struct ipt_entry))) + if (copy_to_user(ce, e, sizeof(struct ipt_entry))) goto out; *dstptr += sizeof(struct compat_ipt_entry); @@ -1459,9 +1459,9 @@ static int compat_copy_entry_to_user(struct ipt_entry *e, goto out; ret = -EFAULT; next_offset = e->next_offset - (origsize - *size); - if (__put_user(target_offset, &ce->target_offset)) + if (put_user(target_offset, &ce->target_offset)) goto out; - if (__put_user(next_offset, &ce->next_offset)) + if (put_user(next_offset, &ce->next_offset)) goto out; return 0; out: diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 17abf60..99293c6 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -289,7 +289,7 @@ int xt_compat_match(void *match, void **dstptr, int *size, int convert) case COMPAT_TO_USER: pm = (struct xt_entry_match *)match; msize = pm->u.user.match_size; - if (__copy_to_user(*dstptr, pm, msize)) { + if (copy_to_user(*dstptr, pm, msize)) { ret = -EFAULT; break; } @@ -366,7 +366,7 @@ int xt_compat_target(void *target, void **dstptr, int *size, int convert) case COMPAT_TO_USER: pt = (struct xt_entry_target *)target; tsize = pt->u.user.target_size; - if (__copy_to_user(*dstptr, pt, tsize)) { + if (copy_to_user(*dstptr, pt, tsize)) { ret = -EFAULT; break; } -- cgit v0.10.2 From 0cc5ae24af08abe8e2a467f45b54c48a0f52670f Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 3 May 2006 23:22:01 -0700 Subject: [ROSE]: Remove useless prototype for rose_remove_neigh(). Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 8631b65..4cb6bfa 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -48,8 +48,6 @@ static DEFINE_SPINLOCK(rose_route_list_lock); struct rose_neigh *rose_loopback_neigh; -static void rose_remove_neigh(struct rose_neigh *); - /* * Add a new route to a node, and in the process add the node and the * neighbour if it is new. -- cgit v0.10.2 From 3f072310d0ca85891323e9d325c37c76de389387 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 3 May 2006 23:22:36 -0700 Subject: [AX.25]: Spelling fix Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index f04f863..5ac9825 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -360,7 +360,7 @@ struct file_operations ax25_route_fops = { /* * Find AX.25 route * - * Only routes with a refernce rout of zero can be destroyed. + * Only routes with a reference count of zero can be destroyed. */ static ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) { -- cgit v0.10.2 From 86cfcb95ec60e910d7efcb35ae89bf3403befaad Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 3 May 2006 23:23:48 -0700 Subject: [AX25, ROSE]: Remove useless SET_MODULE_OWNER calls. Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 509afdd..621e558 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -185,7 +185,6 @@ static struct net_device_stats *nr_get_stats(struct net_device *dev) void nr_setup(struct net_device *dev) { - SET_MODULE_OWNER(dev); dev->mtu = NR_MAX_PACKET_SIZE; dev->hard_start_xmit = nr_xmit; dev->open = nr_open; diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index d297af7..2a1bf8e 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -135,7 +135,6 @@ static struct net_device_stats *rose_get_stats(struct net_device *dev) void rose_setup(struct net_device *dev) { - SET_MODULE_OWNER(dev); dev->mtu = ROSE_MAX_PACKET_SIZE - 2; dev->hard_start_xmit = rose_xmit; dev->open = rose_open; -- cgit v0.10.2 From 3ab33dcc82e014c69ebad3b524d0053378ef76c3 Mon Sep 17 00:00:00 2001 From: Ralf Baechle DL5RB Date: Wed, 3 May 2006 23:24:35 -0700 Subject: [HAMRADIO]: Remove remaining SET_MODULE_OWNER calls from hamradio drivers. Signed-off-by: Ralf Baechle DL5RB Signed-off-by: David S. Miller diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c index 79a8fbc..0d5fccc 100644 --- a/drivers/net/hamradio/dmascc.c +++ b/drivers/net/hamradio/dmascc.c @@ -582,7 +582,6 @@ static int __init setup_adapter(int card_base, int type, int n) INIT_WORK(&priv->rx_work, rx_bh, priv); dev->priv = priv; sprintf(dev->name, "dmascc%i", 2 * n + i); - SET_MODULE_OWNER(dev); dev->base_addr = card_base; dev->irq = irq; dev->open = scc_open; diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c index 6ace0e9..5927784 100644 --- a/drivers/net/hamradio/scc.c +++ b/drivers/net/hamradio/scc.c @@ -1550,7 +1550,6 @@ static unsigned char ax25_nocall[AX25_ADDR_LEN] = static void scc_net_setup(struct net_device *dev) { - SET_MODULE_OWNER(dev); dev->tx_queue_len = 16; /* should be enough... */ dev->open = scc_net_open; diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index fe22479..b498840 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -1098,7 +1098,6 @@ static void yam_setup(struct net_device *dev) dev->base_addr = yp->iobase; dev->irq = yp->irq; - SET_MODULE_OWNER(dev); dev->open = yam_open; dev->stop = yam_close; -- cgit v0.10.2 From 70868eace5031298c6f6e991a40a2106957f582c Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 3 May 2006 23:25:17 -0700 Subject: [AX.25]: Move AX.25 symbol exports Move AX.25 symbol exports to next to their definitions where they're supposed to be these days. Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index dbf9b47..a9f13df 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -228,6 +228,8 @@ ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr, return NULL; } +EXPORT_SYMBOL(ax25_find_cb); + void ax25_send_to_raw(ax25_address *addr, struct sk_buff *skb, int proto) { ax25_cb *s; @@ -1979,24 +1981,6 @@ static struct notifier_block ax25_dev_notifier = { .notifier_call =ax25_device_event, }; -EXPORT_SYMBOL(ax25_hard_header); -EXPORT_SYMBOL(ax25_rebuild_header); -EXPORT_SYMBOL(ax25_findbyuid); -EXPORT_SYMBOL(ax25_find_cb); -EXPORT_SYMBOL(ax25_linkfail_register); -EXPORT_SYMBOL(ax25_linkfail_release); -EXPORT_SYMBOL(ax25_listen_register); -EXPORT_SYMBOL(ax25_listen_release); -EXPORT_SYMBOL(ax25_protocol_register); -EXPORT_SYMBOL(ax25_protocol_release); -EXPORT_SYMBOL(ax25_send_frame); -EXPORT_SYMBOL(ax25_uid_policy); -EXPORT_SYMBOL(ax25cmp); -EXPORT_SYMBOL(ax2asc); -EXPORT_SYMBOL(asc2ax); -EXPORT_SYMBOL(null_ax25_address); -EXPORT_SYMBOL(ax25_display_timer); - static int __init ax25_init(void) { int rc = proto_register(&ax25_proto, 0); diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c index 0164a15..5f0896a 100644 --- a/net/ax25/ax25_addr.c +++ b/net/ax25/ax25_addr.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,8 @@ */ ax25_address null_ax25_address = {{0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00}}; +EXPORT_SYMBOL(null_ax25_address); + /* * ax25 -> ascii conversion */ @@ -64,6 +67,8 @@ char *ax2asc(char *buf, ax25_address *a) } +EXPORT_SYMBOL(ax2asc); + /* * ascii -> ax25 conversion */ @@ -97,6 +102,8 @@ void asc2ax(ax25_address *addr, char *callsign) addr->ax25_call[6] &= 0x1E; } +EXPORT_SYMBOL(asc2ax); + /* * Compare two ax.25 addresses */ @@ -116,6 +123,8 @@ int ax25cmp(ax25_address *a, ax25_address *b) return 2; /* Partial match */ } +EXPORT_SYMBOL(ax25cmp); + /* * Compare two AX.25 digipeater paths. */ diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c index d68aff1..3bb1527 100644 --- a/net/ax25/ax25_iface.c +++ b/net/ax25/ax25_iface.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -74,6 +75,8 @@ int ax25_protocol_register(unsigned int pid, return 1; } +EXPORT_SYMBOL(ax25_protocol_register); + void ax25_protocol_release(unsigned int pid) { struct protocol_struct *s, *protocol; @@ -106,6 +109,8 @@ void ax25_protocol_release(unsigned int pid) write_unlock(&protocol_list_lock); } +EXPORT_SYMBOL(ax25_protocol_release); + int ax25_linkfail_register(void (*func)(ax25_cb *, int)) { struct linkfail_struct *linkfail; @@ -123,6 +128,8 @@ int ax25_linkfail_register(void (*func)(ax25_cb *, int)) return 1; } +EXPORT_SYMBOL(ax25_linkfail_register); + void ax25_linkfail_release(void (*func)(ax25_cb *, int)) { struct linkfail_struct *s, *linkfail; @@ -155,6 +162,8 @@ void ax25_linkfail_release(void (*func)(ax25_cb *, int)) spin_unlock_bh(&linkfail_lock); } +EXPORT_SYMBOL(ax25_linkfail_release); + int ax25_listen_register(ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; @@ -176,6 +185,8 @@ int ax25_listen_register(ax25_address *callsign, struct net_device *dev) return 1; } +EXPORT_SYMBOL(ax25_listen_register); + void ax25_listen_release(ax25_address *callsign, struct net_device *dev) { struct listen_struct *s, *listen; @@ -208,6 +219,8 @@ void ax25_listen_release(ax25_address *callsign, struct net_device *dev) spin_unlock_bh(&listen_lock); } +EXPORT_SYMBOL(ax25_listen_release); + int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *) { int (*res)(struct sk_buff *, ax25_cb *) = NULL; diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index d643dac..a0b534f 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -221,3 +222,5 @@ int ax25_rebuild_header(struct sk_buff *skb) #endif +EXPORT_SYMBOL(ax25_hard_header); +EXPORT_SYMBOL(ax25_rebuild_header); diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 5fc048d..5d99852 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,8 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax2 return ax25; /* We had to create it */ } +EXPORT_SYMBOL(ax25_send_frame); + /* * All outgoing AX.25 I frames pass via this routine. Therefore this is * where the fragmentation of frames takes place. If fragment is set to diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c index 7a6b50a..ec25405 100644 --- a/net/ax25/ax25_timer.c +++ b/net/ax25/ax25_timer.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -137,6 +138,8 @@ unsigned long ax25_display_timer(struct timer_list *timer) return timer->expires - jiffies; } +EXPORT_SYMBOL(ax25_display_timer); + static void ax25_heartbeat_expiry(unsigned long param) { int proto = AX25_PROTO_STD_SIMPLEX; diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c index b8b5854..5e9a81e 100644 --- a/net/ax25/ax25_uid.c +++ b/net/ax25/ax25_uid.c @@ -49,6 +49,8 @@ static DEFINE_RWLOCK(ax25_uid_lock); int ax25_uid_policy = 0; +EXPORT_SYMBOL(ax25_uid_policy); + ax25_uid_assoc *ax25_findbyuid(uid_t uid) { ax25_uid_assoc *ax25_uid, *res = NULL; @@ -67,6 +69,8 @@ ax25_uid_assoc *ax25_findbyuid(uid_t uid) return res; } +EXPORT_SYMBOL(ax25_findbyuid); + int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) { ax25_uid_assoc *ax25_uid; -- cgit v0.10.2 From 4cc7c2734e2b4032103e47d8f3e8b6fa3360d3f1 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 3 May 2006 23:26:20 -0700 Subject: [ROSE]: Fix routing table locking in rose_remove_neigh. The locking rule for rose_remove_neigh() are that the caller needs to hold rose_neigh_list_lock, so we better don't take it yet again in rose_neigh_list_lock. Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 4cb6bfa..a22542f 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -233,11 +233,8 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) skb_queue_purge(&rose_neigh->queue); - spin_lock_bh(&rose_neigh_list_lock); - if ((s = rose_neigh_list) == rose_neigh) { rose_neigh_list = rose_neigh->next; - spin_unlock_bh(&rose_neigh_list_lock); kfree(rose_neigh->digipeat); kfree(rose_neigh); return; @@ -246,7 +243,6 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) while (s != NULL && s->next != NULL) { if (s->next == rose_neigh) { s->next = rose_neigh->next; - spin_unlock_bh(&rose_neigh_list_lock); kfree(rose_neigh->digipeat); kfree(rose_neigh); return; @@ -254,7 +250,6 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) s = s->next; } - spin_unlock_bh(&rose_neigh_list_lock); } /* -- cgit v0.10.2 From e1fdb5b39656ea2be8cadde565e543649a988af9 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 3 May 2006 23:27:16 -0700 Subject: [AX.25]: Eleminate HZ from AX.25 kernel interfaces Convert all AX.25 sysctl time values from jiffies to ms as units. Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller diff --git a/include/net/ax25.h b/include/net/ax25.h index d052b22..5bd9974 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -145,14 +145,14 @@ enum { #define AX25_DEF_CONMODE 2 /* Connected mode allowed */ #define AX25_DEF_WINDOW 2 /* Window=2 */ #define AX25_DEF_EWINDOW 32 /* Module-128 Window=32 */ -#define AX25_DEF_T1 (10 * HZ) /* T1=10s */ -#define AX25_DEF_T2 (3 * HZ) /* T2=3s */ -#define AX25_DEF_T3 (300 * HZ) /* T3=300s */ +#define AX25_DEF_T1 10000 /* T1=10s */ +#define AX25_DEF_T2 3000 /* T2=3s */ +#define AX25_DEF_T3 300000 /* T3=300s */ #define AX25_DEF_N2 10 /* N2=10 */ -#define AX25_DEF_IDLE (0 * 60 * HZ) /* Idle=None */ +#define AX25_DEF_IDLE 0 /* Idle=None */ #define AX25_DEF_PACLEN 256 /* Paclen=256 */ #define AX25_DEF_PROTOCOL AX25_PROTO_STD_SIMPLEX /* Standard AX.25 */ -#define AX25_DEF_DS_TIMEOUT (3 * 60 * HZ) /* DAMA timeout 3 minutes */ +#define AX25_DEF_DS_TIMEOUT 180000 /* DAMA timeout 3 minutes */ typedef struct ax25_uid_assoc { struct hlist_node uid_node; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index a9f13df..a2e0dd0 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -426,6 +426,26 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) return 0; } +static void ax25_fillin_cb_from_dev(ax25_cb *ax25, ax25_dev *ax25_dev) +{ + ax25->rtt = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2; + ax25->t1 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]); + ax25->t2 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T2]); + ax25->t3 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T3]); + ax25->n2 = ax25_dev->values[AX25_VALUES_N2]; + ax25->paclen = ax25_dev->values[AX25_VALUES_PACLEN]; + ax25->idle = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_IDLE]); + ax25->backoff = ax25_dev->values[AX25_VALUES_BACKOFF]; + + if (ax25_dev->values[AX25_VALUES_AXDEFMODE]) { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25_dev->values[AX25_VALUES_EWINDOW]; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25_dev->values[AX25_VALUES_WINDOW]; + } +} + /* * Fill in a created AX.25 created control block with the default * values for a particular device. @@ -435,39 +455,28 @@ void ax25_fillin_cb(ax25_cb *ax25, ax25_dev *ax25_dev) ax25->ax25_dev = ax25_dev; if (ax25->ax25_dev != NULL) { - ax25->rtt = ax25_dev->values[AX25_VALUES_T1] / 2; - ax25->t1 = ax25_dev->values[AX25_VALUES_T1]; - ax25->t2 = ax25_dev->values[AX25_VALUES_T2]; - ax25->t3 = ax25_dev->values[AX25_VALUES_T3]; - ax25->n2 = ax25_dev->values[AX25_VALUES_N2]; - ax25->paclen = ax25_dev->values[AX25_VALUES_PACLEN]; - ax25->idle = ax25_dev->values[AX25_VALUES_IDLE]; - ax25->backoff = ax25_dev->values[AX25_VALUES_BACKOFF]; - - if (ax25_dev->values[AX25_VALUES_AXDEFMODE]) { - ax25->modulus = AX25_EMODULUS; - ax25->window = ax25_dev->values[AX25_VALUES_EWINDOW]; - } else { - ax25->modulus = AX25_MODULUS; - ax25->window = ax25_dev->values[AX25_VALUES_WINDOW]; - } + ax25_fillin_cb_from_dev(ax25, ax25_dev); + return; + } + + /* + * No device, use kernel / AX.25 spec default values + */ + ax25->rtt = msecs_to_jiffies(AX25_DEF_T1) / 2; + ax25->t1 = msecs_to_jiffies(AX25_DEF_T1); + ax25->t2 = msecs_to_jiffies(AX25_DEF_T2); + ax25->t3 = msecs_to_jiffies(AX25_DEF_T3); + ax25->n2 = AX25_DEF_N2; + ax25->paclen = AX25_DEF_PACLEN; + ax25->idle = msecs_to_jiffies(AX25_DEF_IDLE); + ax25->backoff = AX25_DEF_BACKOFF; + + if (AX25_DEF_AXDEFMODE) { + ax25->modulus = AX25_EMODULUS; + ax25->window = AX25_DEF_EWINDOW; } else { - ax25->rtt = AX25_DEF_T1 / 2; - ax25->t1 = AX25_DEF_T1; - ax25->t2 = AX25_DEF_T2; - ax25->t3 = AX25_DEF_T3; - ax25->n2 = AX25_DEF_N2; - ax25->paclen = AX25_DEF_PACLEN; - ax25->idle = AX25_DEF_IDLE; - ax25->backoff = AX25_DEF_BACKOFF; - - if (AX25_DEF_AXDEFMODE) { - ax25->modulus = AX25_EMODULUS; - ax25->window = AX25_DEF_EWINDOW; - } else { - ax25->modulus = AX25_MODULUS; - ax25->window = AX25_DEF_WINDOW; - } + ax25->modulus = AX25_MODULUS; + ax25->window = AX25_DEF_WINDOW; } } diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index 061083e..5961459 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c @@ -61,7 +61,8 @@ void ax25_ds_set_timer(ax25_dev *ax25_dev) return; del_timer(&ax25_dev->dama.slave_timer); - ax25_dev->dama.slave_timeout = ax25_dev->values[AX25_VALUES_DS_TIMEOUT] / 10; + ax25_dev->dama.slave_timeout = + msecs_to_jiffies(ax25_dev->values[AX25_VALUES_DS_TIMEOUT]) / 10; ax25_ds_add_timer(ax25_dev); } diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c index 894a225..bdb64c3 100644 --- a/net/ax25/sysctl_net_ax25.c +++ b/net/ax25/sysctl_net_ax25.c @@ -18,14 +18,14 @@ static int min_backoff[1], max_backoff[] = {2}; static int min_conmode[1], max_conmode[] = {2}; static int min_window[] = {1}, max_window[] = {7}; static int min_ewindow[] = {1}, max_ewindow[] = {63}; -static int min_t1[] = {1}, max_t1[] = {30 * HZ}; -static int min_t2[] = {1}, max_t2[] = {20 * HZ}; -static int min_t3[1], max_t3[] = {3600 * HZ}; -static int min_idle[1], max_idle[] = {65535 * HZ}; +static int min_t1[] = {1}, max_t1[] = {30000}; +static int min_t2[] = {1}, max_t2[] = {20000}; +static int min_t3[1], max_t3[] = {3600000}; +static int min_idle[1], max_idle[] = {65535000}; static int min_n2[] = {1}, max_n2[] = {31}; static int min_paclen[] = {1}, max_paclen[] = {512}; static int min_proto[1], max_proto[] = { AX25_PROTO_MAX }; -static int min_ds_timeout[1], max_ds_timeout[] = {65535 * HZ}; +static int min_ds_timeout[1], max_ds_timeout[] = {65535000}; static struct ctl_table_header *ax25_table_header; -- cgit v0.10.2 From 4d8937d0b113e8ec39f7d18cf13804f3b5fb8fd4 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 3 May 2006 23:27:47 -0700 Subject: [NETROM]: Eleminate HZ from NET/ROM kernel interfaces Convert all NET/ROM sysctl time values from jiffies to ms as units. Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller diff --git a/include/net/netrom.h b/include/net/netrom.h index a5ee53b..e0ca112 100644 --- a/include/net/netrom.h +++ b/include/net/netrom.h @@ -42,11 +42,11 @@ enum { #define NR_COND_PEER_RX_BUSY 0x04 #define NR_COND_OWN_RX_BUSY 0x08 -#define NR_DEFAULT_T1 (120 * HZ) /* Outstanding frames - 120 seconds */ -#define NR_DEFAULT_T2 (5 * HZ) /* Response delay - 5 seconds */ +#define NR_DEFAULT_T1 120000 /* Outstanding frames - 120 seconds */ +#define NR_DEFAULT_T2 5000 /* Response delay - 5 seconds */ #define NR_DEFAULT_N2 3 /* Number of Retries - 3 */ -#define NR_DEFAULT_T4 (180 * HZ) /* Busy Delay - 180 seconds */ -#define NR_DEFAULT_IDLE (0 * 60 * HZ) /* No Activity Timeout - none */ +#define NR_DEFAULT_T4 180000 /* Busy Delay - 180 seconds */ +#define NR_DEFAULT_IDLE 0 /* No Activity Timeout - none */ #define NR_DEFAULT_WINDOW 4 /* Default Window Size - 4 */ #define NR_DEFAULT_OBS 6 /* Default Obsolescence Count - 6 */ #define NR_DEFAULT_QUAL 10 /* Default Neighbour Quality - 10 */ diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index d44981f..ecd288b 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -425,11 +425,16 @@ static int nr_create(struct socket *sock, int protocol) nr_init_timers(sk); - nr->t1 = sysctl_netrom_transport_timeout; - nr->t2 = sysctl_netrom_transport_acknowledge_delay; - nr->n2 = sysctl_netrom_transport_maximum_tries; - nr->t4 = sysctl_netrom_transport_busy_delay; - nr->idle = sysctl_netrom_transport_no_activity_timeout; + nr->t1 = + msecs_to_jiffies(sysctl_netrom_transport_timeout); + nr->t2 = + msecs_to_jiffies(sysctl_netrom_transport_acknowledge_delay); + nr->n2 = + msecs_to_jiffies(sysctl_netrom_transport_maximum_tries); + nr->t4 = + msecs_to_jiffies(sysctl_netrom_transport_busy_delay); + nr->idle = + msecs_to_jiffies(sysctl_netrom_transport_no_activity_timeout); nr->window = sysctl_netrom_transport_requested_window_size; nr->bpqext = 1; -- cgit v0.10.2 From 82e84249f0ee098e004c8bd6d90a1640bd56cfbb Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 3 May 2006 23:28:20 -0700 Subject: [ROSE]: Eleminate HZ from ROSE kernel interfaces Convert all ROSE sysctl time values from jiffies to ms as units. Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller diff --git a/include/net/rose.h b/include/net/rose.h index 3249b97..012b09e 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -49,14 +49,14 @@ enum { ROSE_STATE_5 /* Deferred Call Acceptance */ }; -#define ROSE_DEFAULT_T0 (180 * HZ) /* Default T10 T20 value */ -#define ROSE_DEFAULT_T1 (200 * HZ) /* Default T11 T21 value */ -#define ROSE_DEFAULT_T2 (180 * HZ) /* Default T12 T22 value */ -#define ROSE_DEFAULT_T3 (180 * HZ) /* Default T13 T23 value */ -#define ROSE_DEFAULT_HB (5 * HZ) /* Default Holdback value */ -#define ROSE_DEFAULT_IDLE (0 * 60 * HZ) /* No Activity Timeout - none */ +#define ROSE_DEFAULT_T0 180000 /* Default T10 T20 value */ +#define ROSE_DEFAULT_T1 200000 /* Default T11 T21 value */ +#define ROSE_DEFAULT_T2 180000 /* Default T12 T22 value */ +#define ROSE_DEFAULT_T3 180000 /* Default T13 T23 value */ +#define ROSE_DEFAULT_HB 5000 /* Default Holdback value */ +#define ROSE_DEFAULT_IDLE 0 /* No Activity Timeout - none */ #define ROSE_DEFAULT_ROUTING 1 /* Default routing flag */ -#define ROSE_DEFAULT_FAIL_TIMEOUT (120 * HZ) /* Time until link considered usable */ +#define ROSE_DEFAULT_FAIL_TIMEOUT 120000 /* Time until link considered usable */ #define ROSE_DEFAULT_MAXVC 50 /* Maximum number of VCs per neighbour */ #define ROSE_DEFAULT_WINDOW_SIZE 7 /* Default window size */ diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ea65396..ef4538a 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -518,11 +518,11 @@ static int rose_create(struct socket *sock, int protocol) init_timer(&rose->timer); init_timer(&rose->idletimer); - rose->t1 = sysctl_rose_call_request_timeout; - rose->t2 = sysctl_rose_reset_request_timeout; - rose->t3 = sysctl_rose_clear_request_timeout; - rose->hb = sysctl_rose_ack_hold_back_timeout; - rose->idle = sysctl_rose_no_activity_timeout; + rose->t1 = msecs_to_jiffies(sysctl_rose_call_request_timeout); + rose->t2 = msecs_to_jiffies(sysctl_rose_reset_request_timeout); + rose->t3 = msecs_to_jiffies(sysctl_rose_clear_request_timeout); + rose->hb = msecs_to_jiffies(sysctl_rose_ack_hold_back_timeout); + rose->idle = msecs_to_jiffies(sysctl_rose_no_activity_timeout); rose->state = ROSE_STATE_0; diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index 09e9e9d..bd86a63 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -40,7 +40,8 @@ void rose_start_ftimer(struct rose_neigh *neigh) neigh->ftimer.data = (unsigned long)neigh; neigh->ftimer.function = &rose_ftimer_expiry; - neigh->ftimer.expires = jiffies + sysctl_rose_link_fail_timeout; + neigh->ftimer.expires = + jiffies + msecs_to_jiffies(sysctl_rose_link_fail_timeout); add_timer(&neigh->ftimer); } @@ -51,7 +52,8 @@ static void rose_start_t0timer(struct rose_neigh *neigh) neigh->t0timer.data = (unsigned long)neigh; neigh->t0timer.function = &rose_t0timer_expiry; - neigh->t0timer.expires = jiffies + sysctl_rose_restart_request_timeout; + neigh->t0timer.expires = + jiffies + msecs_to_jiffies(sysctl_rose_restart_request_timeout); add_timer(&neigh->t0timer); } -- cgit v0.10.2 From 75c2d9077c63ac21488129cc23561d4f4fd0f5e5 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 3 May 2006 23:31:35 -0700 Subject: [TCP]: Fix sock_orphan dead lock Calling sock_orphan inside bh_lock_sock in tcp_close can lead to dead locks. For example, the inet_diag code holds sk_callback_lock without disabling BH. If an inbound packet arrives during that admittedly tiny window, it will cause a dead lock on bh_lock_sock. Another possible path would be through sock_wfree if the network device driver frees the tx skb in process context with BH enabled. We can fix this by moving sock_orphan out of bh_lock_sock. The tricky bit is to work out when we need to destroy the socket ourselves and when it has already been destroyed by someone else. By moving sock_orphan before the release_sock we can solve this problem. This is because as long as we own the socket lock its state cannot change. So we simply record the socket state before the release_sock and then check the state again after we regain the socket lock. If the socket state has transitioned to TCP_CLOSE in the time being, we know that the socket has been destroyed. Otherwise the socket is still ours to keep. Note that I've also moved the increment on the orphan count forward. This may look like a problem as we're increasing it even if the socket is just about to be destroyed where it'll be decreased again. However, this simply enlarges a window that already exists. This also changes the orphan count test by one. Considering what the orphan count is meant to do this is no big deal. This problem was discoverd by Ingo Molnar using his lock validator. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 87f68e7..e2b7b80 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1468,6 +1468,7 @@ void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; + int state; lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; @@ -1544,6 +1545,11 @@ void tcp_close(struct sock *sk, long timeout) sk_stream_wait_close(sk, timeout); adjudge_to_death: + state = sk->sk_state; + sock_hold(sk); + sock_orphan(sk); + atomic_inc(sk->sk_prot->orphan_count); + /* It is the last release_sock in its life. It will remove backlog. */ release_sock(sk); @@ -1555,8 +1561,9 @@ adjudge_to_death: bh_lock_sock(sk); BUG_TRAP(!sock_owned_by_user(sk)); - sock_hold(sk); - sock_orphan(sk); + /* Have we already been destroyed by a softirq or backlog? */ + if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) + goto out; /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could @@ -1584,7 +1591,6 @@ adjudge_to_death: if (tmo > TCP_TIMEWAIT_LEN) { inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); } else { - atomic_inc(sk->sk_prot->orphan_count); tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } @@ -1603,7 +1609,6 @@ adjudge_to_death: NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); } } - atomic_inc(sk->sk_prot->orphan_count); if (sk->sk_state == TCP_CLOSE) inet_csk_destroy_sock(sk); -- cgit v0.10.2 From d1a649838802edd94b6335834919463c6ae61f40 Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Wed, 3 May 2006 23:36:23 -0700 Subject: [DECNET]: Fix level1 router hello This patch fixes hello messages sent when a node is a level 1 router. Slightly contrary to the spec (maybe) VMS ignores hello messages that do not name level2 routers that it also knows about. So, here we simply name all the routers that the node knows about rather just other level1 routers. (I hope the patch is clearer than the description. sorry). Signed-off-by: Patrick Caulfield Signed-off-by: David S. Miller diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 7c8692c..66e230c 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -493,7 +493,6 @@ struct elist_cb_state { static void neigh_elist_cb(struct neighbour *neigh, void *_info) { struct elist_cb_state *s = _info; - struct dn_dev *dn_db; struct dn_neigh *dn; if (neigh->dev != s->dev) @@ -503,10 +502,6 @@ static void neigh_elist_cb(struct neighbour *neigh, void *_info) if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2))) return; - dn_db = (struct dn_dev *) s->dev->dn_ptr; - if (dn_db->parms.forwarding == 1 && (dn->flags & DN_NDFLAG_R2)) - return; - if (s->t == s->n) s->rs = dn_find_slot(s->ptr, s->n, dn->priority); else -- cgit v0.10.2 From 98232d504db0a1f91ecaa93686ed3bf61963103b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 May 2006 09:13:49 +0200 Subject: [PATCH] compat_sys_vmsplice: one-off in UIO_MAXIOV check nr_segs may not be > UIO_MAXIOV, however it may be equal to. This makes the behaviour identical to the real sys_vmsplice(). The other foov syscalls also agree that this is the way to go. Signed-off-by: Jens Axboe diff --git a/fs/compat.c b/fs/compat.c index 3f3e8f4..970888a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1323,7 +1323,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, { unsigned i; struct iovec *iov; - if (nr_segs >= UIO_MAXIOV) + if (nr_segs > UIO_MAXIOV) return -EINVAL; iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); for (i = 0; i < nr_segs; i++) { -- cgit v0.10.2 From fe10c6abea8bc83291a13e0580b3e4c355710b09 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 4 May 2006 13:51:45 +0100 Subject: [MMC] Correct mmc_request_done comments mmc_request_done should be called at the end of handling a request, not between the data and initial command parts of the request. Signed-off-by: Russell King diff --git a/drivers/mmc/mmc.c b/drivers/mmc/mmc.c index 05aa4d6..91c6b10 100644 --- a/drivers/mmc/mmc.c +++ b/drivers/mmc/mmc.c @@ -59,13 +59,12 @@ static const unsigned int tacc_mant[] = { /** - * mmc_request_done - finish processing an MMC command - * @host: MMC host which completed command - * @mrq: MMC request which completed + * mmc_request_done - finish processing an MMC request + * @host: MMC host which completed request + * @mrq: MMC request which request * * MMC drivers should call this function when they have completed - * their processing of a command. This should be called before the - * data part of the command has completed. + * their processing of a request. */ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) { -- cgit v0.10.2 From 5b802344357338a4d645beac8ca97470bcbe3542 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Thu, 4 May 2006 14:07:42 +0100 Subject: [ARM] 3490/1: i.MX: move uart resources to board files Patch from Sascha Hauer This patch moves the i.MX uart resources and the gpio pin setup to the board files. This allows the boards to decide how many internal uarts are connected to the outside world and whether they use rts/cts or not. Signed-off-by: Sascha Hauer Signed-off-by: Russell King diff --git a/arch/arm/mach-imx/generic.c b/arch/arm/mach-imx/generic.c index 9d8331b..12ea58a 100644 --- a/arch/arm/mach-imx/generic.c +++ b/arch/arm/mach-imx/generic.c @@ -195,56 +195,6 @@ void __init imx_set_mmc_info(struct imxmmc_platform_data *info) } EXPORT_SYMBOL(imx_set_mmc_info); -static struct resource imx_uart1_resources[] = { - [0] = { - .start = 0x00206000, - .end = 0x002060FF, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = (UART1_MINT_RX), - .end = (UART1_MINT_RX), - .flags = IORESOURCE_IRQ, - }, - [2] = { - .start = (UART1_MINT_TX), - .end = (UART1_MINT_TX), - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device imx_uart1_device = { - .name = "imx-uart", - .id = 0, - .num_resources = ARRAY_SIZE(imx_uart1_resources), - .resource = imx_uart1_resources, -}; - -static struct resource imx_uart2_resources[] = { - [0] = { - .start = 0x00207000, - .end = 0x002070FF, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = (UART2_MINT_RX), - .end = (UART2_MINT_RX), - .flags = IORESOURCE_IRQ, - }, - [2] = { - .start = (UART2_MINT_TX), - .end = (UART2_MINT_TX), - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device imx_uart2_device = { - .name = "imx-uart", - .id = 1, - .num_resources = ARRAY_SIZE(imx_uart2_resources), - .resource = imx_uart2_resources, -}; - static struct imxfb_mach_info imx_fb_info; void __init set_imx_fb_info(struct imxfb_mach_info *hard_imx_fb_info) @@ -283,8 +233,6 @@ static struct platform_device imxfb_device = { static struct platform_device *devices[] __initdata = { &imx_mmc_device, &imxfb_device, - &imx_uart1_device, - &imx_uart2_device, }; static struct map_desc imx_io_desc[] __initdata = { diff --git a/arch/arm/mach-imx/mx1ads.c b/arch/arm/mach-imx/mx1ads.c index e34d0df..e1f6c0b 100644 --- a/arch/arm/mach-imx/mx1ads.c +++ b/arch/arm/mach-imx/mx1ads.c @@ -26,6 +26,7 @@ #include #include +#include #include #include "generic.h" @@ -48,8 +49,70 @@ static struct platform_device cs89x0_device = { .resource = cs89x0_resources, }; +static struct imxuart_platform_data uart_pdata = { + .flags = IMXUART_HAVE_RTSCTS, +}; + +static struct resource imx_uart1_resources[] = { + [0] = { + .start = 0x00206000, + .end = 0x002060FF, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = (UART1_MINT_RX), + .end = (UART1_MINT_RX), + .flags = IORESOURCE_IRQ, + }, + [2] = { + .start = (UART1_MINT_TX), + .end = (UART1_MINT_TX), + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device imx_uart1_device = { + .name = "imx-uart", + .id = 0, + .num_resources = ARRAY_SIZE(imx_uart1_resources), + .resource = imx_uart1_resources, + .dev = { + .platform_data = &uart_pdata, + } +}; + +static struct resource imx_uart2_resources[] = { + [0] = { + .start = 0x00207000, + .end = 0x002070FF, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = (UART2_MINT_RX), + .end = (UART2_MINT_RX), + .flags = IORESOURCE_IRQ, + }, + [2] = { + .start = (UART2_MINT_TX), + .end = (UART2_MINT_TX), + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device imx_uart2_device = { + .name = "imx-uart", + .id = 1, + .num_resources = ARRAY_SIZE(imx_uart2_resources), + .resource = imx_uart2_resources, + .dev = { + .platform_data = &uart_pdata, + } +}; + static struct platform_device *devices[] __initdata = { &cs89x0_device, + &imx_uart1_device, + &imx_uart2_device, }; #ifdef CONFIG_MMC_IMX @@ -75,6 +138,17 @@ mx1ads_init(void) imx_gpio_mode(GPIO_PORTB | GPIO_GIUS | GPIO_IN | 20); imx_set_mmc_info(&mx1ads_mmc_info); #endif + + imx_gpio_mode(PC9_PF_UART1_CTS); + imx_gpio_mode(PC10_PF_UART1_RTS); + imx_gpio_mode(PC11_PF_UART1_TXD); + imx_gpio_mode(PC12_PF_UART1_RXD); + + imx_gpio_mode(PB28_PF_UART2_CTS); + imx_gpio_mode(PB29_PF_UART2_RTS); + imx_gpio_mode(PB30_PF_UART2_TXD); + imx_gpio_mode(PB31_PF_UART2_RXD); + platform_add_devices(devices, ARRAY_SIZE(devices)); } diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c index c3b7a66..d202eb4 100644 --- a/drivers/serial/imx.c +++ b/drivers/serial/imx.c @@ -45,6 +45,7 @@ #include #include #include +#include /* We've been assigned a range on the "Low-density serial ports" major */ #define SERIAL_IMX_MAJOR 204 @@ -73,7 +74,8 @@ struct imx_port { struct uart_port port; struct timer_list timer; unsigned int old_status; - int txirq,rxirq,rtsirq; + int txirq,rxirq,rtsirq; + int have_rtscts:1; }; /* @@ -491,8 +493,12 @@ imx_set_termios(struct uart_port *port, struct termios *termios, ucr2 = UCR2_SRST | UCR2_IRTS; if (termios->c_cflag & CRTSCTS) { - ucr2 &= ~UCR2_IRTS; - ucr2 |= UCR2_CTSC; + if( sport->have_rtscts ) { + ucr2 &= ~UCR2_IRTS; + ucr2 |= UCR2_CTSC; + } else { + termios->c_cflag &= ~CRTSCTS; + } } if (termios->c_cflag & CSTOPB) @@ -719,27 +725,6 @@ static void __init imx_init_ports(void) imx_ports[i].timer.function = imx_timeout; imx_ports[i].timer.data = (unsigned long)&imx_ports[i]; } - - imx_gpio_mode(PC9_PF_UART1_CTS); - imx_gpio_mode(PC10_PF_UART1_RTS); - imx_gpio_mode(PC11_PF_UART1_TXD); - imx_gpio_mode(PC12_PF_UART1_RXD); - imx_gpio_mode(PB28_PF_UART2_CTS); - imx_gpio_mode(PB29_PF_UART2_RTS); - - imx_gpio_mode(PB30_PF_UART2_TXD); - imx_gpio_mode(PB31_PF_UART2_RXD); - -#if 0 /* We don't need these, on the mx1 the _modem_ side of the uart - * is implemented. - */ - imx_gpio_mode(PD7_AF_UART2_DTR); - imx_gpio_mode(PD8_AF_UART2_DCD); - imx_gpio_mode(PD9_AF_UART2_RI); - imx_gpio_mode(PD10_AF_UART2_DSR); -#endif - - } #ifdef CONFIG_SERIAL_IMX_CONSOLE @@ -932,7 +917,14 @@ static int serial_imx_resume(struct platform_device *dev) static int serial_imx_probe(struct platform_device *dev) { + struct imxuart_platform_data *pdata; + imx_ports[dev->id].port.dev = &dev->dev; + + pdata = (struct imxuart_platform_data *)dev->dev.platform_data; + if(pdata && (pdata->flags & IMXUART_HAVE_RTSCTS)) + imx_ports[dev->id].have_rtscts = 1; + uart_add_one_port(&imx_reg, &imx_ports[dev->id].port); platform_set_drvdata(dev, &imx_ports[dev->id]); return 0; diff --git a/include/asm-arm/arch-imx/imx-uart.h b/include/asm-arm/arch-imx/imx-uart.h new file mode 100644 index 0000000..3a685e1 --- /dev/null +++ b/include/asm-arm/arch-imx/imx-uart.h @@ -0,0 +1,10 @@ +#ifndef ASMARM_ARCH_UART_H +#define ASMARM_ARCH_UART_H + +#define IMXUART_HAVE_RTSCTS (1<<0) + +struct imxuart_platform_data { + unsigned int flags; +}; + +#endif -- cgit v0.10.2 From 920e70c5c603ada05dd480ca0ccc0ae12a5fdc39 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 4 May 2006 18:22:51 +0100 Subject: [MMC] Move set_ios debugging into mmc.c Rather than having every driver duplicate the set_ios debugging, provide a single version in mmc.c which can be expanded as we add additional functionality. Signed-off-by: Russell King diff --git a/drivers/mmc/at91_mci.c b/drivers/mmc/at91_mci.c index 6061c2d..88f0eef 100644 --- a/drivers/mmc/at91_mci.c +++ b/drivers/mmc/at91_mci.c @@ -621,9 +621,6 @@ static void at91_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) struct at91mci_host *host = mmc_priv(mmc); unsigned long at91_master_clock = clk_get_rate(mci_clk); - DBG("Clock %uHz, busmode %u, powermode %u, Vdd %u\n", - ios->clock, ios->bus_mode, ios->power_mode, ios->vdd); - if (host) host->bus_mode = ios->bus_mode; else diff --git a/drivers/mmc/au1xmmc.c b/drivers/mmc/au1xmmc.c index c0326bb..914d62b 100644 --- a/drivers/mmc/au1xmmc.c +++ b/drivers/mmc/au1xmmc.c @@ -720,10 +720,6 @@ static void au1xmmc_set_ios(struct mmc_host* mmc, struct mmc_ios* ios) { struct au1xmmc_host *host = mmc_priv(mmc); - DBG("set_ios (power=%u, clock=%uHz, vdd=%u, mode=%u)\n", - host->id, ios->power_mode, ios->clock, ios->vdd, - ios->bus_mode); - if (ios->power_mode == MMC_POWER_OFF) au1xmmc_set_power(host, 0); else if (ios->power_mode == MMC_POWER_ON) { diff --git a/drivers/mmc/imxmmc.c b/drivers/mmc/imxmmc.c index bc27192..79358e22 100644 --- a/drivers/mmc/imxmmc.c +++ b/drivers/mmc/imxmmc.c @@ -807,10 +807,6 @@ static void imxmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) struct imxmci_host *host = mmc_priv(mmc); int prescaler; - dev_dbg(mmc_dev(host->mmc), "clock %u power %u vdd %u width %u\n", - ios->clock, ios->power_mode, ios->vdd, - (ios->bus_width==MMC_BUS_WIDTH_4)?4:1); - if( ios->bus_width==MMC_BUS_WIDTH_4 ) { host->actual_bus_width = MMC_BUS_WIDTH_4; imx_gpio_mode(PB11_PF_SD_DAT3); diff --git a/drivers/mmc/mmc.c b/drivers/mmc/mmc.c index 91c6b10..1ca2c8b 100644 --- a/drivers/mmc/mmc.c +++ b/drivers/mmc/mmc.c @@ -69,10 +69,13 @@ static const unsigned int tacc_mant[] = { void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) { struct mmc_command *cmd = mrq->cmd; - int err = mrq->cmd->error; - pr_debug("MMC: req done (%02x): %d: %08x %08x %08x %08x\n", - cmd->opcode, err, cmd->resp[0], cmd->resp[1], - cmd->resp[2], cmd->resp[3]); + int err = cmd->error; + + pr_debug("%s: req done (CMD%u): %d/%d/%d: %08x %08x %08x %08x\n", + mmc_hostname(host), cmd->opcode, err, + mrq->data ? mrq->data->error : 0, + mrq->stop ? mrq->stop->error : 0, + cmd->resp[0], cmd->resp[1], cmd->resp[2], cmd->resp[3]); if (err && cmd->retries) { cmd->retries--; @@ -96,8 +99,9 @@ EXPORT_SYMBOL(mmc_request_done); void mmc_start_request(struct mmc_host *host, struct mmc_request *mrq) { - pr_debug("MMC: starting cmd %02x arg %08x flags %08x\n", - mrq->cmd->opcode, mrq->cmd->arg, mrq->cmd->flags); + pr_debug("%s: starting CMD%u arg %08x flags %08x\n", + mmc_hostname(host), mrq->cmd->opcode, + mrq->cmd->arg, mrq->cmd->flags); WARN_ON(host->card_busy == NULL); @@ -311,6 +315,18 @@ void mmc_release_host(struct mmc_host *host) EXPORT_SYMBOL(mmc_release_host); +static inline void mmc_set_ios(struct mmc_host *host) +{ + struct mmc_ios *ios = &host->ios; + + pr_debug("%s: clock %uHz busmode %u powermode %u cs %u Vdd %u width %u\n", + mmc_hostname(host), ios->clock, ios->bus_mode, + ios->power_mode, ios->chip_select, ios->vdd, + ios->bus_width); + + host->ops->set_ios(host, ios); +} + static int mmc_select_card(struct mmc_host *host, struct mmc_card *card) { int err; @@ -363,7 +379,7 @@ static int mmc_select_card(struct mmc_host *host, struct mmc_card *card) } } - host->ops->set_ios(host, &host->ios); + mmc_set_ios(host); return MMC_ERR_NONE; } @@ -414,7 +430,7 @@ static u32 mmc_select_voltage(struct mmc_host *host, u32 ocr) ocr = 3 << bit; host->ios.vdd = bit; - host->ops->set_ios(host, &host->ios); + mmc_set_ios(host); } else { ocr = 0; } @@ -667,7 +683,7 @@ static void mmc_idle_cards(struct mmc_host *host) struct mmc_command cmd; host->ios.chip_select = MMC_CS_HIGH; - host->ops->set_ios(host, &host->ios); + mmc_set_ios(host); mmc_delay(1); @@ -680,7 +696,7 @@ static void mmc_idle_cards(struct mmc_host *host) mmc_delay(1); host->ios.chip_select = MMC_CS_DONTCARE; - host->ops->set_ios(host, &host->ios); + mmc_set_ios(host); mmc_delay(1); } @@ -705,13 +721,13 @@ static void mmc_power_up(struct mmc_host *host) host->ios.chip_select = MMC_CS_DONTCARE; host->ios.power_mode = MMC_POWER_UP; host->ios.bus_width = MMC_BUS_WIDTH_1; - host->ops->set_ios(host, &host->ios); + mmc_set_ios(host); mmc_delay(1); host->ios.clock = host->f_min; host->ios.power_mode = MMC_POWER_ON; - host->ops->set_ios(host, &host->ios); + mmc_set_ios(host); mmc_delay(2); } @@ -724,7 +740,7 @@ static void mmc_power_off(struct mmc_host *host) host->ios.chip_select = MMC_CS_DONTCARE; host->ios.power_mode = MMC_POWER_OFF; host->ios.bus_width = MMC_BUS_WIDTH_1; - host->ops->set_ios(host, &host->ios); + mmc_set_ios(host); } static int mmc_send_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr) @@ -972,7 +988,8 @@ static unsigned int mmc_calculate_clock(struct mmc_host *host) if (!mmc_card_dead(card) && max_dtr > card->csd.max_dtr) max_dtr = card->csd.max_dtr; - pr_debug("MMC: selected %d.%03dMHz transfer rate\n", + pr_debug("%s: selected %d.%03dMHz transfer rate\n", + mmc_hostname(host), max_dtr / 1000000, (max_dtr / 1000) % 1000); return max_dtr; @@ -1047,7 +1064,7 @@ static void mmc_setup(struct mmc_host *host) } else { host->ios.bus_mode = MMC_BUSMODE_OPENDRAIN; host->ios.clock = host->f_min; - host->ops->set_ios(host, &host->ios); + mmc_set_ios(host); /* * We should remember the OCR mask from the existing @@ -1083,7 +1100,7 @@ static void mmc_setup(struct mmc_host *host) * Ok, now switch to push-pull mode. */ host->ios.bus_mode = MMC_BUSMODE_PUSHPULL; - host->ops->set_ios(host, &host->ios); + mmc_set_ios(host); mmc_read_csds(host); @@ -1129,7 +1146,7 @@ static void mmc_rescan(void *data) * attached cards and the host support. */ host->ios.clock = mmc_calculate_clock(host); - host->ops->set_ios(host, &host->ios); + mmc_set_ios(host); } mmc_release_host(host); diff --git a/drivers/mmc/mmci.c b/drivers/mmc/mmci.c index df7e861e..da8e4d7 100644 --- a/drivers/mmc/mmci.c +++ b/drivers/mmc/mmci.c @@ -402,9 +402,6 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) struct mmci_host *host = mmc_priv(mmc); u32 clk = 0, pwr = 0; - DBG(host, "clock %uHz busmode %u powermode %u Vdd %u\n", - ios->clock, ios->bus_mode, ios->power_mode, ios->vdd); - if (ios->clock) { if (ios->clock >= host->mclk) { clk = MCI_CLK_BYPASS; diff --git a/drivers/mmc/pxamci.c b/drivers/mmc/pxamci.c index 40970c4..f97b472 100644 --- a/drivers/mmc/pxamci.c +++ b/drivers/mmc/pxamci.c @@ -365,10 +365,6 @@ static void pxamci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) { struct pxamci_host *host = mmc_priv(mmc); - pr_debug("PXAMCI: clock %u power %u vdd %u.%02u\n", - ios->clock, ios->power_mode, ios->vdd / 100, - ios->vdd % 100); - if (ios->clock) { unsigned int clk = CLOCKRATE / ios->clock; if (CLOCKRATE / clk > ios->clock) diff --git a/drivers/mmc/sdhci.c b/drivers/mmc/sdhci.c index bdbfca0..b005328 100644 --- a/drivers/mmc/sdhci.c +++ b/drivers/mmc/sdhci.c @@ -570,10 +570,6 @@ static void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) spin_lock_irqsave(&host->lock, flags); - DBG("clock %uHz busmode %u powermode %u cs %u Vdd %u width %u\n", - ios->clock, ios->bus_mode, ios->power_mode, ios->chip_select, - ios->vdd, ios->bus_width); - /* * Reset the chip on each power off. * Should clear out any weird states. diff --git a/drivers/mmc/wbsd.c b/drivers/mmc/wbsd.c index 511f7b0..39b3d97 100644 --- a/drivers/mmc/wbsd.c +++ b/drivers/mmc/wbsd.c @@ -931,10 +931,6 @@ static void wbsd_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) struct wbsd_host *host = mmc_priv(mmc); u8 clk, setup, pwr; - DBGF("clock %uHz busmode %u powermode %u cs %u Vdd %u width %u\n", - ios->clock, ios->bus_mode, ios->power_mode, ios->chip_select, - ios->vdd, ios->bus_width); - spin_lock_bh(&host->lock); /* -- cgit v0.10.2 From fed3be9bd56e67c9b9324277b7f95c32e73a75bb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 4 May 2006 13:23:40 -0400 Subject: CREDITS file update (Tristan Greaves) By request from Tristan. Signed-off-by: Linus Torvalds diff --git a/CREDITS b/CREDITS index 787564b..6f50be3 100644 --- a/CREDITS +++ b/CREDITS @@ -1194,15 +1194,9 @@ S: Brecksville, OH 44141-1334 S: USA N: Tristan Greaves -E: Tristan.Greaves@icl.com -E: tmg296@ecs.soton.ac.uk -W: http://www.ecs.soton.ac.uk/~tmg296 +E: tristan@extricate.org +W: http://www.extricate.org/ D: Miscellaneous ipv4 sysctl patches -S: 15 Little Mead -S: Denmead -S: Hampshire -S: PO7 6HS -S: United Kingdom N: Michael A. Griffith E: grif@cs.ucr.edu -- cgit v0.10.2 From ff10952a547dad934d9ed9afc5cf579ed1ccb53a Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 5 May 2006 15:11:14 +0100 Subject: [ARM] 3494/1: asm-arm/bug.h needs linux/stddef.h Patch from Nicolas Pitre ... for the definition of NULL. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King diff --git a/include/asm-arm/bug.h b/include/asm-arm/bug.h index 7fb0213..5ab8216 100644 --- a/include/asm-arm/bug.h +++ b/include/asm-arm/bug.h @@ -2,6 +2,7 @@ #define _ASMARM_BUG_H #include +#include #ifdef CONFIG_BUG #ifdef CONFIG_DEBUG_BUGVERBOSE -- cgit v0.10.2 From 2eb9d3157107497fdccb51e1570fea677f6e3c82 Mon Sep 17 00:00:00 2001 From: Uwe Zeisberger Date: Fri, 5 May 2006 15:11:14 +0100 Subject: [ARM] 3496/1: more constants for asm-offsets.h Patch from Uwe Zeisberger added the following constants: - MACHINFO_TYPE - MACHINFO_NAME - MACHINFO_PHYSIO - MACHINFO_PGOFFIO - PROCINFO_INITFUNC - PROCINFO_MMUFLAGS and removed their definition from head.S and head-nommu.S Signed-off-by: Uwe Zeisberger Signed-off-by: Russell King diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index b324dca..45fdf4a 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -95,5 +95,11 @@ int main(void) DEFINE(SYS_ERROR0, 0x9f0000); BLANK(); DEFINE(SIZEOF_MACHINE_DESC, sizeof(struct machine_desc)); + DEFINE(MACHINFO_TYPE, offsetof(struct machine_desc, nr)); + DEFINE(MACHINFO_NAME, offsetof(struct machine_desc, name)); + DEFINE(MACHINFO_PHYSIO, offsetof(struct machine_desc, phys_io)); + DEFINE(MACHINFO_PGOFFIO, offsetof(struct machine_desc, io_pg_offst)); + DEFINE(PROCINFO_INITFUNC, offsetof(struct proc_info_list, __cpu_flush)); + DEFINE(PROCINFO_MMUFLAGS, offsetof(struct proc_info_list, __cpu_mmu_flags)); return 0; } diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index 0bea658..adf62e5e 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -20,12 +20,10 @@ #include #include #include +#include #include #include -#define PROCINFO_INITFUNC 12 -#define MACHINFO_TYPE 0 - /* * Kernel startup entry point. * --------------------------- diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 04b66a9..04f7344 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -24,14 +24,6 @@ #include #include -#define PROCINFO_MMUFLAGS 8 -#define PROCINFO_INITFUNC 12 - -#define MACHINFO_TYPE 0 -#define MACHINFO_PHYSIO 4 -#define MACHINFO_PGOFFIO 8 -#define MACHINFO_NAME 12 - #define KERNEL_RAM_ADDR (PAGE_OFFSET + TEXT_OFFSET) /* -- cgit v0.10.2 From 56cf6504fc1c0c221b82cebc16a444b684140fb7 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 5 May 2006 17:57:52 +0100 Subject: [BLOCK] Fix oops on removal of SD/MMC card The block layer keeps a reference (driverfs_dev) to the struct device associated with the block device, and uses it internally for generating uevents in block_uevent. Block device uevents include umounting the partition, which can occur after the backing device has been removed. Unfortunately, this reference is not counted. This means that if the struct device is removed from the device tree, the block layers reference will become stale. Guard against this by holding a reference to the struct device in add_disk(), and only drop the reference when we're releasing the gendisk kobject - in other words when we can be sure that no further uevents will be generated for this block device. Signed-off-by: Russell King Acked-by: Jens Axboe diff --git a/block/genhd.c b/block/genhd.c index 5a8d3bf..d965725 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -182,6 +182,7 @@ static int exact_lock(dev_t dev, void *data) */ void add_disk(struct gendisk *disk) { + get_device(disk->driverfs_dev); disk->flags |= GENHD_FL_UP; blk_register_region(MKDEV(disk->major, disk->first_minor), disk->minors, NULL, exact_match, exact_lock, disk); @@ -427,6 +428,7 @@ static struct attribute * default_attrs[] = { static void disk_release(struct kobject * kobj) { struct gendisk *disk = to_disk(kobj); + put_device(disk->driverfs_dev); kfree(disk->random); kfree(disk->part); free_disk_stats(disk); -- cgit v0.10.2 From b7d7ef87e15dea105be59ec8f14e2f92182dd421 Mon Sep 17 00:00:00 2001 From: "George G. Davis" Date: Fri, 5 May 2006 22:32:23 +0100 Subject: [ARM] 3499/1: Fix VFP FPSCR corruption for double exception case Patch from George G. Davis The ARM VFP FPSCR register is corrupted when a condition flags modifying VFP instruction is followed by a non-condition flags modifying VFP instruction and both instructions raise exceptions. The fix is to read the current FPSCR in between emulation of these two instructions and use the current FPSCR value when handling the second exception. Signed-off-by: George G. Davis Signed-off-by: Russell King diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 37ff814..03486be 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -245,7 +245,7 @@ void VFP9_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) */ barrier(); trigger = fmrx(FPINST2); - fpscr = fmrx(FPSCR); + orig_fpscr = fpscr = fmrx(FPSCR); emulate: exceptions = vfp_emulate_instruction(trigger, fpscr, regs); -- cgit v0.10.2 From 99532559dc7a8e686b2cef14c780a7ad5dbd4a31 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 5 May 2006 22:32:24 +0100 Subject: [ARM] 3500/1: fix PXA27x DMA allocation priority Patch from Nicolas Pitre Intel PXA27x developers manual section 5.4.1.1 lists a priority distribution for the DMA channels differently than what the code currently assumes. This patch fixes that. Noticed by Simon Vogl Signed-off-by: Nicolas Pitre Signed-off-by: Russell King diff --git a/arch/arm/mach-pxa/dma.c b/arch/arm/mach-pxa/dma.c index 458112b..7d8c854 100644 --- a/arch/arm/mach-pxa/dma.c +++ b/arch/arm/mach-pxa/dma.c @@ -45,23 +45,16 @@ int pxa_request_dma (char *name, pxa_dma_prio prio, local_irq_save(flags); - /* try grabbing a DMA channel with the requested priority */ - for (i = prio; i < prio + PXA_DMA_NBCH(prio); i++) { - if (!dma_channels[i].name) { - found = 1; - break; - } - } - - if (!found) { - /* requested prio group is full, try hier priorities */ - for (i = prio-1; i >= 0; i--) { + do { + /* try grabbing a DMA channel with the requested priority */ + pxa_for_each_dma_prio (i, prio) { if (!dma_channels[i].name) { found = 1; break; } } - } + /* if requested prio group is full, try a hier priority */ + } while (!found && prio--); if (found) { DCSR(i) = DCSR_STARTINTR|DCSR_ENDINTR|DCSR_BUSERR; diff --git a/include/asm-arm/arch-pxa/dma.h b/include/asm-arm/arch-pxa/dma.h index 3e88a2a..a008150 100644 --- a/include/asm-arm/arch-pxa/dma.h +++ b/include/asm-arm/arch-pxa/dma.h @@ -24,27 +24,29 @@ typedef struct pxa_dma_desc { volatile u32 dcmd; /* DCMD value for the current transfer */ } pxa_dma_desc; +typedef enum { + DMA_PRIO_HIGH = 0, + DMA_PRIO_MEDIUM = 1, + DMA_PRIO_LOW = 2 +} pxa_dma_prio; + #if defined(CONFIG_PXA27x) #define PXA_DMA_CHANNELS 32 -#define PXA_DMA_NBCH(prio) ((prio == DMA_PRIO_LOW) ? 16 : 8) -typedef enum { - DMA_PRIO_HIGH = 0, - DMA_PRIO_MEDIUM = 8, - DMA_PRIO_LOW = 16 -} pxa_dma_prio; +#define pxa_for_each_dma_prio(ch, prio) \ +for ( \ + ch = prio * 4; \ + ch != (4 << prio) + 16; \ + ch = (ch + 1 == (4 << prio)) ? (prio * 4 + 16) : (ch + 1) \ +) #elif defined(CONFIG_PXA25x) #define PXA_DMA_CHANNELS 16 -#define PXA_DMA_NBCH(prio) ((prio == DMA_PRIO_LOW) ? 8 : 4) -typedef enum { - DMA_PRIO_HIGH = 0, - DMA_PRIO_MEDIUM = 4, - DMA_PRIO_LOW = 8 -} pxa_dma_prio; +#define pxa_for_each_dma_prio(ch, prio) \ + for (ch = prio * 4; ch != (4 << prio); ch++) #endif -- cgit v0.10.2 From 568cb09b9d889b6f2852ede19772b8e9eed36c1e Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 5 May 2006 22:35:05 +0100 Subject: [ARM] 3495/1: EABI: undefine removed syscalls, but... Patch from Nicolas Pitre ... but only for user space. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King diff --git a/include/asm-arm/unistd.h b/include/asm-arm/unistd.h index 26f2f48..cbf39a5 100644 --- a/include/asm-arm/unistd.h +++ b/include/asm-arm/unistd.h @@ -363,7 +363,7 @@ /* * The following syscalls are obsolete and no longer available for EABI. */ -#if defined(__ARM_EABI__) +#if defined(__ARM_EABI__) && !defined(__KERNEL__) #undef __NR_time #undef __NR_umount #undef __NR_stime -- cgit v0.10.2 From 7c3ceb4fb9667f34f1599a062efecf4cdc4a4ce5 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 5 May 2006 17:02:09 -0700 Subject: [SCTP]: Allow spillover of receive buffer to avoid deadlock. This patch fixes a deadlock situation in the receive path by allowing temporary spillover of the receive buffer. - If the chunk we receive has a tsn that immediately follows the ctsn, accept it even if we run out of receive buffer space and renege data with higher TSNs. - Once we accept one chunk in a packet, accept all the remaining chunks even if we run out of receive buffer space. Signed-off-by: Neil Horman Acked-by: Mark Butler Acked-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index eba99f3..7f4fea1 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -712,6 +712,7 @@ struct sctp_chunk { __u8 tsn_gap_acked; /* Is this chunk acked by a GAP ACK? */ __s8 fast_retransmit; /* Is this chunk fast retransmitted? */ __u8 tsn_missing_report; /* Data chunk missing counter. */ + __u8 data_accepted; /* At least 1 chunk in this packet accepted */ }; void sctp_chunk_hold(struct sctp_chunk *); diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 297b895..cf0c767 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -149,6 +149,7 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) /* This is the first chunk in the packet. */ chunk->singleton = 1; ch = (sctp_chunkhdr_t *) chunk->skb->data; + chunk->data_accepted = 0; } chunk->chunk_hdr = ch; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 2b9a832..f5d131f 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -5151,7 +5151,9 @@ static int sctp_eat_data(const struct sctp_association *asoc, int tmp; __u32 tsn; int account_value; + struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; struct sock *sk = asoc->base.sk; + int rcvbuf_over = 0; data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); @@ -5162,10 +5164,16 @@ static int sctp_eat_data(const struct sctp_association *asoc, /* ASSERT: Now skb->data is really the user data. */ /* - * if we are established, and we have used up our receive - * buffer memory, drop the frame - */ - if (asoc->state == SCTP_STATE_ESTABLISHED) { + * If we are established, and we have used up our receive buffer + * memory, think about droping the frame. + * Note that we have an opportunity to improve performance here. + * If we accept one chunk from an skbuff, we have to keep all the + * memory of that skbuff around until the chunk is read into user + * space. Therefore, once we accept 1 chunk we may as well accept all + * remaining chunks in the skbuff. The data_accepted flag helps us do + * that. + */ + if ((asoc->state == SCTP_STATE_ESTABLISHED) && (!chunk->data_accepted)) { /* * If the receive buffer policy is 1, then each * association can allocate up to sk_rcvbuf bytes @@ -5176,9 +5184,25 @@ static int sctp_eat_data(const struct sctp_association *asoc, account_value = atomic_read(&asoc->rmem_alloc); else account_value = atomic_read(&sk->sk_rmem_alloc); - - if (account_value > sk->sk_rcvbuf) - return SCTP_IERROR_IGNORE_TSN; + if (account_value > sk->sk_rcvbuf) { + /* + * We need to make forward progress, even when we are + * under memory pressure, so we always allow the + * next tsn after the ctsn ack point to be accepted. + * This lets us avoid deadlocks in which we have to + * drop frames that would otherwise let us drain the + * receive queue. + */ + if ((sctp_tsnmap_get_ctsn(map) + 1) != tsn) + return SCTP_IERROR_IGNORE_TSN; + + /* + * We're going to accept the frame but we should renege + * to make space for it. This will send us down that + * path later in this function. + */ + rcvbuf_over = 1; + } } /* Process ECN based congestion. @@ -5226,6 +5250,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, datalen -= sizeof(sctp_data_chunk_t); deliver = SCTP_CMD_CHUNK_ULP; + chunk->data_accepted = 1; /* Think about partial delivery. */ if ((datalen >= asoc->rwnd) && (!asoc->ulpq.pd_mode)) { @@ -5242,7 +5267,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, * large spill over. */ if (!asoc->rwnd || asoc->rwnd_over || - (datalen > asoc->rwnd + asoc->frag_point)) { + (datalen > asoc->rwnd + asoc->frag_point) || + rcvbuf_over) { /* If this is the next TSN, consider reneging to make * room. Note: Playing nice with a confused sender. A @@ -5250,8 +5276,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, * space and in the future we may want to detect and * do more drastic reneging. */ - if (sctp_tsnmap_has_gap(&asoc->peer.tsn_map) && - (sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1) == tsn) { + if (sctp_tsnmap_has_gap(map) && + (sctp_tsnmap_get_ctsn(map) + 1) == tsn) { SCTP_DEBUG_PRINTK("Reneging for tsn:%u\n", tsn); deliver = SCTP_CMD_RENEGE; } else { -- cgit v0.10.2 From 672e7cca17ed6036a1756ed34cf20dbd72d5e5f6 Mon Sep 17 00:00:00 2001 From: Vladislav Yasevich Date: Fri, 5 May 2006 17:03:49 -0700 Subject: [SCTP]: Prevent possible infinite recursion with multiple bundled DATA. There is a rare situation that causes lksctp to go into infinite recursion and crash the system. The trigger is a packet that contains at least the first two DATA fragments of a message bundled together. The recursion is triggered when the user data buffer is smaller that the full data message. The problem is that we clone the skb for every fragment in the message. When reassembling the full message, we try to link skbs from the "first fragment" clone using the frag_list. However, since the frag_list is shared between two clones in this rare situation, we end up setting the frag_list pointer of the second fragment to point to itself. This causes sctp_skb_pull() to potentially recurse indefinitely. Proposed solution is to make a copy of the skb when attempting to link things using frag_list. Signed-off-by: Vladislav Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 2080b2d..575e556 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -279,6 +279,7 @@ static inline void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag) { struct sk_buff *pos; + struct sk_buff *new = NULL; struct sctp_ulpevent *event; struct sk_buff *pnext, *last; struct sk_buff *list = skb_shinfo(f_frag)->frag_list; @@ -297,11 +298,33 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *qu */ if (last) last->next = pos; - else - skb_shinfo(f_frag)->frag_list = pos; + else { + if (skb_cloned(f_frag)) { + /* This is a cloned skb, we can't just modify + * the frag_list. We need a new skb to do that. + * Instead of calling skb_unshare(), we'll do it + * ourselves since we need to delay the free. + */ + new = skb_copy(f_frag, GFP_ATOMIC); + if (!new) + return NULL; /* try again later */ + + new->sk = f_frag->sk; + + skb_shinfo(new)->frag_list = pos; + } else + skb_shinfo(f_frag)->frag_list = pos; + } /* Remove the first fragment from the reassembly queue. */ __skb_unlink(f_frag, queue); + + /* if we did unshare, then free the old skb and re-assign */ + if (new) { + kfree_skb(f_frag); + f_frag = new; + } + while (pos) { pnext = pos->next; -- cgit v0.10.2 From 62b08083ec3dbfd7e533c8d230dd1d8191a6e813 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 5 May 2006 17:04:43 -0700 Subject: [SCTP]: Fix panic's when receiving fragmented SCTP control chunks. Use pskb_pull() to handle incoming COOKIE_ECHO and HEARTBEAT chunks that are received as skb's with fragment list. Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index f5d131f..8cdba51 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -636,8 +636,9 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(const struct sctp_endpoint *ep, */ chunk->subh.cookie_hdr = (struct sctp_signed_cookie *)chunk->skb->data; - skb_pull(chunk->skb, - ntohs(chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t)); + if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t))) + goto nomem; /* 5.1 D) Upon reception of the COOKIE ECHO chunk, Endpoint * "Z" will reply with a COOKIE ACK chunk after building a TCB @@ -965,7 +966,8 @@ sctp_disposition_t sctp_sf_beat_8_3(const struct sctp_endpoint *ep, */ chunk->subh.hb_hdr = (sctp_heartbeathdr_t *) chunk->skb->data; paylen = ntohs(chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t); - skb_pull(chunk->skb, paylen); + if (!pskb_pull(chunk->skb, paylen)) + goto nomem; reply = sctp_make_heartbeat_ack(asoc, chunk, chunk->subh.hb_hdr, paylen); @@ -1860,8 +1862,9 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(const struct sctp_endpoint *ep, * are in good shape. */ chunk->subh.cookie_hdr = (struct sctp_signed_cookie *)chunk->skb->data; - skb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) - - sizeof(sctp_chunkhdr_t)); + if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t))) + goto nomem; /* In RFC 2960 5.2.4 3, if both Verification Tags in the State Cookie * of a duplicate COOKIE ECHO match the Verification Tags of the -- cgit v0.10.2 From 35d63edb1c807bc5317e49592260e84637bc432e Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 5 May 2006 17:05:23 -0700 Subject: [SCTP]: Fix state table entries for chunks received in CLOSED state. Discard an unexpected chunk in CLOSED state rather can calling BUG(). Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 75ef104..8bcca56 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -366,9 +366,9 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, /* SCTP_STATE_EMPTY */ \ {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ /* SCTP_STATE_CLOSED */ \ - {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ /* SCTP_STATE_COOKIE_WAIT */ \ - {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ /* SCTP_STATE_COOKIE_ECHOED */ \ {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \ /* SCTP_STATE_ESTABLISHED */ \ @@ -380,7 +380,7 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ - {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ } /* TYPE_SCTP_ECN_ECNE */ #define TYPE_SCTP_ECN_CWR { \ @@ -401,7 +401,7 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ - {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ } /* TYPE_SCTP_ECN_CWR */ #define TYPE_SCTP_SHUTDOWN_COMPLETE { \ @@ -647,7 +647,7 @@ chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_EMPTY */ \ {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ /* SCTP_STATE_CLOSED */ \ - {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + {.fn = sctp_sf_error_closed, .name = "sctp_sf_error_closed"}, \ /* SCTP_STATE_COOKIE_WAIT */ \ {.fn = sctp_sf_do_prm_requestheartbeat, \ .name = "sctp_sf_do_prm_requestheartbeat"}, \ -- cgit v0.10.2 From 1c29fc4989bc2a3838b2837adc12b8aeb0feeede Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 5 May 2006 17:07:13 -0700 Subject: [BRIDGE]: keep track of received multicast packets It makes sense to add this simple statistic to keep track of received multicast packets. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index b0b7f55..bfa4d8c 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -66,6 +66,7 @@ int br_handle_frame_finish(struct sk_buff *skb) } if (is_multicast_ether_addr(dest)) { + br->statistics.multicast++; br_flood_forward(br, skb, !passedup); if (!passedup) br_pass_frame_up(br, skb); -- cgit v0.10.2 From 134af34632a7b3b0a98a79a2e56bf9cc927e0eac Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 5 May 2006 17:09:13 -0700 Subject: [DCCP]: Fix sock_orphan dead lock Calling sock_orphan inside bh_lock_sock in dccp_close can lead to dead locks. For example, the inet_diag code holds sk_callback_lock without disabling BH. If an inbound packet arrives during that admittedly tiny window, it will cause a dead lock on bh_lock_sock. Another possible path would be through sock_wfree if the network device driver frees the tx skb in process context with BH enabled. We can fix this by moving sock_orphan out of bh_lock_sock. The tricky bit is to work out when we need to destroy the socket ourselves and when it has already been destroyed by someone else. By moving sock_orphan before the release_sock we can solve this problem. This is because as long as we own the socket lock its state cannot change. So we simply record the socket state before the release_sock and then check the state again after we regain the socket lock. If the socket state has transitioned to DCCP_CLOSED in the time being, we know that the socket has been destroyed. Otherwise the socket is still ours to keep. This problem was discoverd by Ingo Molnar using his lock validator. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 1ff7328..2e0ee83 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -848,6 +848,7 @@ static int dccp_close_state(struct sock *sk) void dccp_close(struct sock *sk, long timeout) { struct sk_buff *skb; + int state; lock_sock(sk); @@ -882,6 +883,11 @@ void dccp_close(struct sock *sk, long timeout) sk_stream_wait_close(sk, timeout); adjudge_to_death: + state = sk->sk_state; + sock_hold(sk); + sock_orphan(sk); + atomic_inc(sk->sk_prot->orphan_count); + /* * It is the last release_sock in its life. It will remove backlog. */ @@ -894,8 +900,9 @@ adjudge_to_death: bh_lock_sock(sk); BUG_TRAP(!sock_owned_by_user(sk)); - sock_hold(sk); - sock_orphan(sk); + /* Have we already been destroyed by a softirq or backlog? */ + if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) + goto out; /* * The last release_sock may have processed the CLOSE or RESET @@ -915,12 +922,12 @@ adjudge_to_death: #endif } - atomic_inc(sk->sk_prot->orphan_count); if (sk->sk_state == DCCP_CLOSED) inet_csk_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ +out: bh_unlock_sock(sk); local_bh_enable(); sock_put(sk); -- cgit v0.10.2 From f530937b2cccdb131cb459977943c98421ab09b3 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Fri, 5 May 2006 17:19:26 -0700 Subject: [NETROM/ROSE]: Kill module init version kernel log messages. There are out of date and don't tell the user anything useful. The similar messages which IPV4 and the core networking used to output were killed a long time ago. Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index ecd288b..3669cb9 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1370,8 +1370,6 @@ static struct notifier_block nr_dev_notifier = { static struct net_device **dev_nr; -static char banner[] __initdata = KERN_INFO "G4KLX NET/ROM for Linux. Version 0.7 for AX25.037 Linux 2.4\n"; - static int __init nr_proto_init(void) { int i; @@ -1419,7 +1417,6 @@ static int __init nr_proto_init(void) } register_netdevice_notifier(&nr_dev_notifier); - printk(banner); ax25_protocol_register(AX25_P_NETROM, nr_route_frame); ax25_linkfail_register(nr_link_failed); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ef4538a..55564ef 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1469,8 +1469,6 @@ static struct notifier_block rose_dev_notifier = { static struct net_device **dev_rose; -static const char banner[] = KERN_INFO "F6FBB/G4KLX ROSE for Linux. Version 0.62 for AX25.037 Linux 2.4\n"; - static int __init rose_proto_init(void) { int i; @@ -1519,7 +1517,6 @@ static int __init rose_proto_init(void) sock_register(&rose_family_ops); register_netdevice_notifier(&rose_dev_notifier); - printk(banner); ax25_protocol_register(AX25_P_ROSE, rose_route_frame); ax25_linkfail_register(rose_link_failed); -- cgit v0.10.2 From 5528e568a760442e0ec8fd2dea1f0791875a066b Mon Sep 17 00:00:00 2001 From: John Heffner Date: Fri, 5 May 2006 17:41:44 -0700 Subject: [TCP]: Fix snd_cwnd adjustments in tcp_highspeed.c Xiaoliang (David) Wei wrote: > Hi gurus, > > I am reading the code of tcp_highspeed.c in the kernel and have a > question on the hstcp_cong_avoid function, specifically the following > AI part (line 136~143 in net/ipv4/tcp_highspeed.c ): > > /* Do additive increase */ > if (tp->snd_cwnd < tp->snd_cwnd_clamp) { > tp->snd_cwnd_cnt += ca->ai; > if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { > tp->snd_cwnd++; > tp->snd_cwnd_cnt -= tp->snd_cwnd; > } > } > > In this part, when (tp->snd_cwnd_cnt == tp->snd_cwnd), > snd_cwnd_cnt will be -1... snd_cwnd_cnt is defined as u16, will this > small chance of getting -1 becomes a problem? > Shall we change it by reversing the order of the cwnd++ and cwnd_cnt -= > cwnd? Absolutely correct. Thanks. Signed-off-by: John Heffner Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index e0e9d13..b72fa55 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -137,8 +137,8 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, if (tp->snd_cwnd < tp->snd_cwnd_clamp) { tp->snd_cwnd_cnt += ca->ai; if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - tp->snd_cwnd++; tp->snd_cwnd_cnt -= tp->snd_cwnd; + tp->snd_cwnd++; } } } -- cgit v0.10.2