From ca3060d39ae7a0964f8c123a4833029981e86476 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 15 Sep 2015 16:09:22 -0700 Subject: c6x: Use generic clkdev.h header The c6x clkdev.h header is the same as the asm-generic header, so just use the asm-generic one. Signed-off-by: Stephen Boyd Signed-off-by: Mark Salter diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index f17c4dc..a9bc2c5 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -4,6 +4,7 @@ generic-y += auxvec.h generic-y += barrier.h generic-y += bitsperlong.h generic-y += bugs.h +generic-y += clkdev.h generic-y += cputime.h generic-y += current.h generic-y += device.h diff --git a/arch/c6x/include/asm/clkdev.h b/arch/c6x/include/asm/clkdev.h deleted file mode 100644 index 76a070b..0000000 --- a/arch/c6x/include/asm/clkdev.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _ASM_CLKDEV_H -#define _ASM_CLKDEV_H - -#include - -struct clk; - -static inline int __clk_get(struct clk *clk) -{ - return 1; -} - -static inline void __clk_put(struct clk *clk) -{ -} - -static inline struct clk_lookup_alloc *__clkdev_alloc(size_t size) -{ - return kzalloc(size, GFP_KERNEL); -} - -#endif /* _ASM_CLKDEV_H */ -- cgit v0.10.2 From 2c9bceced3a7105665488085072bb7979cdc5257 Mon Sep 17 00:00:00 2001 From: Manikanta Pubbisetty Date: Tue, 20 Oct 2015 12:11:40 +0530 Subject: ath10k: consolidate if statements in ath10k_wmi_event_mgmt_rx This patch replaces multiple if conditional checks with a single if condition in WMI management rx handler. Found during code review. Signed-off-by: Manikanta Pubbisetty Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index 7569db0..9e93ba3 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -2204,22 +2204,9 @@ int ath10k_wmi_event_mgmt_rx(struct ath10k *ar, struct sk_buff *skb) ath10k_dbg(ar, ATH10K_DBG_MGMT, "event mgmt rx status %08x\n", rx_status); - if (test_bit(ATH10K_CAC_RUNNING, &ar->dev_flags)) { - dev_kfree_skb(skb); - return 0; - } - - if (rx_status & WMI_RX_STATUS_ERR_DECRYPT) { - dev_kfree_skb(skb); - return 0; - } - - if (rx_status & WMI_RX_STATUS_ERR_KEY_CACHE_MISS) { - dev_kfree_skb(skb); - return 0; - } - - if (rx_status & WMI_RX_STATUS_ERR_CRC) { + if ((test_bit(ATH10K_CAC_RUNNING, &ar->dev_flags)) || + (rx_status & (WMI_RX_STATUS_ERR_DECRYPT | + WMI_RX_STATUS_ERR_KEY_CACHE_MISS | WMI_RX_STATUS_ERR_CRC))) { dev_kfree_skb(skb); return 0; } -- cgit v0.10.2 From 3fab30f7e8f31a06702ee6b03a902caffa5bc724 Mon Sep 17 00:00:00 2001 From: Tamizh chelvam Date: Thu, 29 Oct 2015 14:27:37 +0200 Subject: ath10k: add abstraction layer for peer flags Abstraction layer for peer flags is added to fix ABI breakage. 
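To make the pattern concrete outside the driver, here is a minimal standalone sketch of the flag-map idea, assuming nothing beyond standard C; all names and values below are hypothetical stand-ins, not the driver's actual definitions:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical model: driver code ORs in symbolic fields, and the map
 * selected at attach time supplies that firmware's real bit values.
 */
struct peer_flags_map {
	uint32_t auth;
	uint32_t qos;
};

static const struct peer_flags_map main_fw_map = { .auth = 0x1, .qos = 0x2 };
static const struct peer_flags_map tlv_fw_map = { .auth = 0x1, .qos = 0x2 };

int main(void)
{
	int use_tlv = 0; /* would come from the detected WMI op version */
	const struct peer_flags_map *pf = use_tlv ? &tlv_fw_map : &main_fw_map;
	uint32_t peer_flags = 0;

	peer_flags |= pf->auth; /* instead of a hardcoded WMI_PEER_AUTH */
	peer_flags |= pf->qos;
	printf("peer_flags: 0x%08x\n", (unsigned)peer_flags);
	return 0;
}

This keeps call sites identical across firmware branches; only the map chosen at attach time differs.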
Signed-off-by: Tamizh chelvam Signed-off-by: SenthilKumar Jegadeesan Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index 4a23015..01a4173 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -151,6 +151,7 @@ struct ath10k_wmi { struct wmi_vdev_param_map *vdev_param; struct wmi_pdev_param_map *pdev_param; const struct wmi_ops *ops; + const struct wmi_peer_flags_map *peer_flags; u32 num_mem_chunks; u32 rx_decap_mode; diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index a7411fe..2ccda0e 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -1960,7 +1960,7 @@ static void ath10k_peer_assoc_h_basic(struct ath10k *ar, ether_addr_copy(arg->addr, sta->addr); arg->vdev_id = arvif->vdev_id; arg->peer_aid = aid; - arg->peer_flags |= WMI_PEER_AUTH; + arg->peer_flags |= arvif->ar->wmi.peer_flags->auth; arg->peer_listen_intval = ath10k_peer_assoc_h_listen_intval(ar, vif); arg->peer_num_spatial_streams = 1; arg->peer_caps = vif->bss_conf.assoc_capability; @@ -2002,12 +2002,12 @@ static void ath10k_peer_assoc_h_crypto(struct ath10k *ar, /* FIXME: base on RSN IE/WPA IE is a correct idea? */ if (rsnie || wpaie) { ath10k_dbg(ar, ATH10K_DBG_WMI, "%s: rsn ie found\n", __func__); - arg->peer_flags |= WMI_PEER_NEED_PTK_4_WAY; + arg->peer_flags |= ar->wmi.peer_flags->need_ptk_4_way; } if (wpaie) { ath10k_dbg(ar, ATH10K_DBG_WMI, "%s: wpa ie found\n", __func__); - arg->peer_flags |= WMI_PEER_NEED_GTK_2_WAY; + arg->peer_flags |= ar->wmi.peer_flags->need_gtk_2_way; } } @@ -2104,7 +2104,7 @@ static void ath10k_peer_assoc_h_ht(struct ath10k *ar, ath10k_peer_assoc_h_vht_masked(vht_mcs_mask)) return; - arg->peer_flags |= WMI_PEER_HT; + arg->peer_flags |= ar->wmi.peer_flags->ht; arg->peer_max_mpdu = (1 << (IEEE80211_HT_MAX_AMPDU_FACTOR + ht_cap->ampdu_factor)) - 1; @@ -2115,10 +2115,10 @@ static void ath10k_peer_assoc_h_ht(struct ath10k *ar, arg->peer_rate_caps |= WMI_RC_HT_FLAG; if (ht_cap->cap & IEEE80211_HT_CAP_LDPC_CODING) - arg->peer_flags |= WMI_PEER_LDPC; + arg->peer_flags |= ar->wmi.peer_flags->ldbc; if (sta->bandwidth >= IEEE80211_STA_RX_BW_40) { - arg->peer_flags |= WMI_PEER_40MHZ; + arg->peer_flags |= ar->wmi.peer_flags->bw40; arg->peer_rate_caps |= WMI_RC_CW40_FLAG; } @@ -2132,7 +2132,7 @@ static void ath10k_peer_assoc_h_ht(struct ath10k *ar, if (ht_cap->cap & IEEE80211_HT_CAP_TX_STBC) { arg->peer_rate_caps |= WMI_RC_TX_STBC_FLAG; - arg->peer_flags |= WMI_PEER_STBC; + arg->peer_flags |= ar->wmi.peer_flags->stbc; } if (ht_cap->cap & IEEE80211_HT_CAP_RX_STBC) { @@ -2140,7 +2140,7 @@ static void ath10k_peer_assoc_h_ht(struct ath10k *ar, stbc = stbc >> IEEE80211_HT_CAP_RX_STBC_SHIFT; stbc = stbc << WMI_RC_RX_STBC_FLAG_S; arg->peer_rate_caps |= stbc; - arg->peer_flags |= WMI_PEER_STBC; + arg->peer_flags |= ar->wmi.peer_flags->stbc; } if (ht_cap->mcs.rx_mask[1] && ht_cap->mcs.rx_mask[2]) @@ -2321,10 +2321,10 @@ static void ath10k_peer_assoc_h_vht(struct ath10k *ar, if (ath10k_peer_assoc_h_vht_masked(vht_mcs_mask)) return; - arg->peer_flags |= WMI_PEER_VHT; + arg->peer_flags |= ar->wmi.peer_flags->vht; if (def.chan->band == IEEE80211_BAND_2GHZ) - arg->peer_flags |= WMI_PEER_VHT_2G; + arg->peer_flags |= ar->wmi.peer_flags->vht_2g; arg->peer_vht_caps = vht_cap->cap; @@ -2341,7 +2341,7 @@ static void ath10k_peer_assoc_h_vht(struct ath10k *ar, ampdu_factor)) - 1); if (sta->bandwidth == IEEE80211_STA_RX_BW_80) - 
arg->peer_flags |= WMI_PEER_80MHZ; + arg->peer_flags |= ar->wmi.peer_flags->bw80; arg->peer_vht_rates.rx_max_rate = __le16_to_cpu(vht_cap->vht_mcs.rx_highest); @@ -2366,27 +2366,28 @@ static void ath10k_peer_assoc_h_qos(struct ath10k *ar, switch (arvif->vdev_type) { case WMI_VDEV_TYPE_AP: if (sta->wme) - arg->peer_flags |= WMI_PEER_QOS; + arg->peer_flags |= arvif->ar->wmi.peer_flags->qos; if (sta->wme && sta->uapsd_queues) { - arg->peer_flags |= WMI_PEER_APSD; + arg->peer_flags |= arvif->ar->wmi.peer_flags->apsd; arg->peer_rate_caps |= WMI_RC_UAPSD_FLAG; } break; case WMI_VDEV_TYPE_STA: if (vif->bss_conf.qos) - arg->peer_flags |= WMI_PEER_QOS; + arg->peer_flags |= arvif->ar->wmi.peer_flags->qos; break; case WMI_VDEV_TYPE_IBSS: if (sta->wme) - arg->peer_flags |= WMI_PEER_QOS; + arg->peer_flags |= arvif->ar->wmi.peer_flags->qos; break; default: break; } ath10k_dbg(ar, ATH10K_DBG_MAC, "mac peer %pM qos %d\n", - sta->addr, !!(arg->peer_flags & WMI_PEER_QOS)); + sta->addr, !!(arg->peer_flags & + arvif->ar->wmi.peer_flags->qos)); } static bool ath10k_mac_sta_has_ofdm_only(struct ieee80211_sta *sta) diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c index 6fbd17b..3b3a27b 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c @@ -3485,6 +3485,24 @@ static const struct wmi_ops wmi_tlv_ops = { .fw_stats_fill = ath10k_wmi_main_op_fw_stats_fill, }; +static const struct wmi_peer_flags_map wmi_tlv_peer_flags_map = { + .auth = WMI_TLV_PEER_AUTH, + .qos = WMI_TLV_PEER_QOS, + .need_ptk_4_way = WMI_TLV_PEER_NEED_PTK_4_WAY, + .need_gtk_2_way = WMI_TLV_PEER_NEED_GTK_2_WAY, + .apsd = WMI_TLV_PEER_APSD, + .ht = WMI_TLV_PEER_HT, + .bw40 = WMI_TLV_PEER_40MHZ, + .stbc = WMI_TLV_PEER_STBC, + .ldbc = WMI_TLV_PEER_LDPC, + .dyn_mimops = WMI_TLV_PEER_DYN_MIMOPS, + .static_mimops = WMI_TLV_PEER_STATIC_MIMOPS, + .spatial_mux = WMI_TLV_PEER_SPATIAL_MUX, + .vht = WMI_TLV_PEER_VHT, + .bw80 = WMI_TLV_PEER_80MHZ, + .pmf = WMI_TLV_PEER_PMF, +}; + /************/ /* TLV init */ /************/ @@ -3495,4 +3513,5 @@ void ath10k_wmi_tlv_attach(struct ath10k *ar) ar->wmi.vdev_param = &wmi_tlv_vdev_param_map; ar->wmi.pdev_param = &wmi_tlv_pdev_param_map; ar->wmi.ops = &wmi_tlv_ops; + ar->wmi.peer_flags = &wmi_tlv_peer_flags_map; } diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.h b/drivers/net/wireless/ath/ath10k/wmi-tlv.h index ad655c4..dd67859 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.h +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.h @@ -527,6 +527,24 @@ enum wmi_tlv_vdev_param { WMI_TLV_VDEV_PARAM_IBSS_PS_1RX_CHAIN_IN_ATIM_WINDOW_ENABLE, }; +enum wmi_tlv_peer_flags { + WMI_TLV_PEER_AUTH = 0x00000001, + WMI_TLV_PEER_QOS = 0x00000002, + WMI_TLV_PEER_NEED_PTK_4_WAY = 0x00000004, + WMI_TLV_PEER_NEED_GTK_2_WAY = 0x00000010, + WMI_TLV_PEER_APSD = 0x00000800, + WMI_TLV_PEER_HT = 0x00001000, + WMI_TLV_PEER_40MHZ = 0x00002000, + WMI_TLV_PEER_STBC = 0x00008000, + WMI_TLV_PEER_LDPC = 0x00010000, + WMI_TLV_PEER_DYN_MIMOPS = 0x00020000, + WMI_TLV_PEER_STATIC_MIMOPS = 0x00040000, + WMI_TLV_PEER_SPATIAL_MUX = 0x00200000, + WMI_TLV_PEER_VHT = 0x02000000, + WMI_TLV_PEER_80MHZ = 0x04000000, + WMI_TLV_PEER_PMF = 0x08000000, +}; + enum wmi_tlv_tag { WMI_TLV_TAG_LAST_RESERVED = 15, diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index 9e93ba3..8cd068b 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -1546,6 +1546,61 @@ static struct 
wmi_pdev_param_map wmi_10_4_pdev_param_map = { .arp_dstaddr = WMI_10_4_PDEV_PARAM_ARP_DSTADDR, }; +static const struct wmi_peer_flags_map wmi_peer_flags_map = { + .auth = WMI_PEER_AUTH, + .qos = WMI_PEER_QOS, + .need_ptk_4_way = WMI_PEER_NEED_PTK_4_WAY, + .need_gtk_2_way = WMI_PEER_NEED_GTK_2_WAY, + .apsd = WMI_PEER_APSD, + .ht = WMI_PEER_HT, + .bw40 = WMI_PEER_40MHZ, + .stbc = WMI_PEER_STBC, + .ldbc = WMI_PEER_LDPC, + .dyn_mimops = WMI_PEER_DYN_MIMOPS, + .static_mimops = WMI_PEER_STATIC_MIMOPS, + .spatial_mux = WMI_PEER_SPATIAL_MUX, + .vht = WMI_PEER_VHT, + .bw80 = WMI_PEER_80MHZ, + .vht_2g = WMI_PEER_VHT_2G, + .pmf = WMI_PEER_PMF, +}; + +static const struct wmi_peer_flags_map wmi_10x_peer_flags_map = { + .auth = WMI_10X_PEER_AUTH, + .qos = WMI_10X_PEER_QOS, + .need_ptk_4_way = WMI_10X_PEER_NEED_PTK_4_WAY, + .need_gtk_2_way = WMI_10X_PEER_NEED_GTK_2_WAY, + .apsd = WMI_10X_PEER_APSD, + .ht = WMI_10X_PEER_HT, + .bw40 = WMI_10X_PEER_40MHZ, + .stbc = WMI_10X_PEER_STBC, + .ldbc = WMI_10X_PEER_LDPC, + .dyn_mimops = WMI_10X_PEER_DYN_MIMOPS, + .static_mimops = WMI_10X_PEER_STATIC_MIMOPS, + .spatial_mux = WMI_10X_PEER_SPATIAL_MUX, + .vht = WMI_10X_PEER_VHT, + .bw80 = WMI_10X_PEER_80MHZ, +}; + +static const struct wmi_peer_flags_map wmi_10_2_peer_flags_map = { + .auth = WMI_10_2_PEER_AUTH, + .qos = WMI_10_2_PEER_QOS, + .need_ptk_4_way = WMI_10_2_PEER_NEED_PTK_4_WAY, + .need_gtk_2_way = WMI_10_2_PEER_NEED_GTK_2_WAY, + .apsd = WMI_10_2_PEER_APSD, + .ht = WMI_10_2_PEER_HT, + .bw40 = WMI_10_2_PEER_40MHZ, + .stbc = WMI_10_2_PEER_STBC, + .ldbc = WMI_10_2_PEER_LDPC, + .dyn_mimops = WMI_10_2_PEER_DYN_MIMOPS, + .static_mimops = WMI_10_2_PEER_STATIC_MIMOPS, + .spatial_mux = WMI_10_2_PEER_SPATIAL_MUX, + .vht = WMI_10_2_PEER_VHT, + .bw80 = WMI_10_2_PEER_80MHZ, + .vht_2g = WMI_10_2_PEER_VHT_2G, + .pmf = WMI_10_2_PEER_PMF, +}; + void ath10k_wmi_put_wmi_channel(struct wmi_channel *ch, const struct wmi_channel_arg *arg) { @@ -7554,30 +7609,35 @@ int ath10k_wmi_attach(struct ath10k *ar) ar->wmi.cmd = &wmi_10_4_cmd_map; ar->wmi.vdev_param = &wmi_10_4_vdev_param_map; ar->wmi.pdev_param = &wmi_10_4_pdev_param_map; + ar->wmi.peer_flags = &wmi_10_2_peer_flags_map; break; case ATH10K_FW_WMI_OP_VERSION_10_2_4: ar->wmi.cmd = &wmi_10_2_4_cmd_map; ar->wmi.ops = &wmi_10_2_4_ops; ar->wmi.vdev_param = &wmi_10_2_4_vdev_param_map; ar->wmi.pdev_param = &wmi_10_2_4_pdev_param_map; + ar->wmi.peer_flags = &wmi_10_2_peer_flags_map; break; case ATH10K_FW_WMI_OP_VERSION_10_2: ar->wmi.cmd = &wmi_10_2_cmd_map; ar->wmi.ops = &wmi_10_2_ops; ar->wmi.vdev_param = &wmi_10x_vdev_param_map; ar->wmi.pdev_param = &wmi_10x_pdev_param_map; + ar->wmi.peer_flags = &wmi_10_2_peer_flags_map; break; case ATH10K_FW_WMI_OP_VERSION_10_1: ar->wmi.cmd = &wmi_10x_cmd_map; ar->wmi.ops = &wmi_10_1_ops; ar->wmi.vdev_param = &wmi_10x_vdev_param_map; ar->wmi.pdev_param = &wmi_10x_pdev_param_map; + ar->wmi.peer_flags = &wmi_10x_peer_flags_map; break; case ATH10K_FW_WMI_OP_VERSION_MAIN: ar->wmi.cmd = &wmi_cmd_map; ar->wmi.ops = &wmi_ops; ar->wmi.vdev_param = &wmi_vdev_param_map; ar->wmi.pdev_param = &wmi_pdev_param_map; + ar->wmi.peer_flags = &wmi_peer_flags_map; break; case ATH10K_FW_WMI_OP_VERSION_TLV: ath10k_wmi_tlv_attach(ar); diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h index 72a4ef7..96dd98c 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.h +++ b/drivers/net/wireless/ath/ath10k/wmi.h @@ -5641,21 +5641,79 @@ struct wmi_peer_set_q_empty_callback_cmd { __le32 callback_enable; } __packed; -#define WMI_PEER_AUTH 
0x00000001 -#define WMI_PEER_QOS 0x00000002 -#define WMI_PEER_NEED_PTK_4_WAY 0x00000004 -#define WMI_PEER_NEED_GTK_2_WAY 0x00000010 -#define WMI_PEER_APSD 0x00000800 -#define WMI_PEER_HT 0x00001000 -#define WMI_PEER_40MHZ 0x00002000 -#define WMI_PEER_STBC 0x00008000 -#define WMI_PEER_LDPC 0x00010000 -#define WMI_PEER_DYN_MIMOPS 0x00020000 -#define WMI_PEER_STATIC_MIMOPS 0x00040000 -#define WMI_PEER_SPATIAL_MUX 0x00200000 -#define WMI_PEER_VHT 0x02000000 -#define WMI_PEER_80MHZ 0x04000000 -#define WMI_PEER_VHT_2G 0x08000000 +struct wmi_peer_flags_map { + u32 auth; + u32 qos; + u32 need_ptk_4_way; + u32 need_gtk_2_way; + u32 apsd; + u32 ht; + u32 bw40; + u32 stbc; + u32 ldbc; + u32 dyn_mimops; + u32 static_mimops; + u32 spatial_mux; + u32 vht; + u32 bw80; + u32 vht_2g; + u32 pmf; +}; + +enum wmi_peer_flags { + WMI_PEER_AUTH = 0x00000001, + WMI_PEER_QOS = 0x00000002, + WMI_PEER_NEED_PTK_4_WAY = 0x00000004, + WMI_PEER_NEED_GTK_2_WAY = 0x00000010, + WMI_PEER_APSD = 0x00000800, + WMI_PEER_HT = 0x00001000, + WMI_PEER_40MHZ = 0x00002000, + WMI_PEER_STBC = 0x00008000, + WMI_PEER_LDPC = 0x00010000, + WMI_PEER_DYN_MIMOPS = 0x00020000, + WMI_PEER_STATIC_MIMOPS = 0x00040000, + WMI_PEER_SPATIAL_MUX = 0x00200000, + WMI_PEER_VHT = 0x02000000, + WMI_PEER_80MHZ = 0x04000000, + WMI_PEER_VHT_2G = 0x08000000, + WMI_PEER_PMF = 0x10000000, +}; + +enum wmi_10x_peer_flags { + WMI_10X_PEER_AUTH = 0x00000001, + WMI_10X_PEER_QOS = 0x00000002, + WMI_10X_PEER_NEED_PTK_4_WAY = 0x00000004, + WMI_10X_PEER_NEED_GTK_2_WAY = 0x00000010, + WMI_10X_PEER_APSD = 0x00000800, + WMI_10X_PEER_HT = 0x00001000, + WMI_10X_PEER_40MHZ = 0x00002000, + WMI_10X_PEER_STBC = 0x00008000, + WMI_10X_PEER_LDPC = 0x00010000, + WMI_10X_PEER_DYN_MIMOPS = 0x00020000, + WMI_10X_PEER_STATIC_MIMOPS = 0x00040000, + WMI_10X_PEER_SPATIAL_MUX = 0x00200000, + WMI_10X_PEER_VHT = 0x02000000, + WMI_10X_PEER_80MHZ = 0x04000000, +}; + +enum wmi_10_2_peer_flags { + WMI_10_2_PEER_AUTH = 0x00000001, + WMI_10_2_PEER_QOS = 0x00000002, + WMI_10_2_PEER_NEED_PTK_4_WAY = 0x00000004, + WMI_10_2_PEER_NEED_GTK_2_WAY = 0x00000010, + WMI_10_2_PEER_APSD = 0x00000800, + WMI_10_2_PEER_HT = 0x00001000, + WMI_10_2_PEER_40MHZ = 0x00002000, + WMI_10_2_PEER_STBC = 0x00008000, + WMI_10_2_PEER_LDPC = 0x00010000, + WMI_10_2_PEER_DYN_MIMOPS = 0x00020000, + WMI_10_2_PEER_STATIC_MIMOPS = 0x00040000, + WMI_10_2_PEER_SPATIAL_MUX = 0x00200000, + WMI_10_2_PEER_VHT = 0x02000000, + WMI_10_2_PEER_80MHZ = 0x04000000, + WMI_10_2_PEER_VHT_2G = 0x08000000, + WMI_10_2_PEER_PMF = 0x10000000, +}; /* * Peer rate capabilities. -- cgit v0.10.2 From 90eceb3b5fb0d0f413f475165314d4578c3a46c4 Mon Sep 17 00:00:00 2001 From: Tamizh chelvam Date: Thu, 29 Oct 2015 14:27:42 +0200 Subject: ath10k: set peer MFP flag in peer assoc command Set the peer's management frame protection flag in the peer assoc command; this setting will enable/disable encryption of management frames in fw. Setting of this flag is based on whether MFP is enabled/disabled at the STA and on the firmware feature flag ATH10K_FW_FEATURE_MFP_SUPPORT. This is because only firmwares 10.1.561 and above have support for MFP. 
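As a standalone model of the gating logic this patch adds (hypothetical names and bit values; the real code tests ar->fw_features with test_bit() and uses the peer-flags map introduced earlier in this series):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FEATURE_MFP_SUPPORT (1u << 12) /* stand-in for ATH10K_FW_FEATURE_MFP_SUPPORT */

/* PMF is requested only when the station negotiated MFP *and* the
 * firmware advertised support for it.
 */
static uint32_t crypto_peer_flags(bool sta_mfp, uint32_t fw_features,
				  uint32_t pmf_flag)
{
	uint32_t flags = 0;

	if (sta_mfp && (fw_features & FEATURE_MFP_SUPPORT))
		flags |= pmf_flag;
	return flags;
}

int main(void)
{
	printf("0x%08x\n",
	       (unsigned)crypto_peer_flags(true, FEATURE_MFP_SUPPORT, 0x10000000));
	return 0;
}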
Signed-off-by: Tamizh chelvam Signed-off-by: Manikanta pubbisetty Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index aa9bd92..dc4fc4e 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -167,6 +167,7 @@ static const char *const ath10k_core_fw_feature_str[] = { [ATH10K_FW_FEATURE_SUPPORTS_SKIP_CLOCK_INIT] = "skip-clock-init", [ATH10K_FW_FEATURE_RAW_MODE_SUPPORT] = "raw-mode", [ATH10K_FW_FEATURE_SUPPORTS_ADAPTIVE_CCA] = "adaptive-cca", + [ATH10K_FW_FEATURE_MFP_SUPPORT] = "mfp", }; static unsigned int ath10k_core_get_fw_feature_str(char *buf, diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index 01a4173..c26f84e 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -513,6 +513,9 @@ enum ath10k_fw_features { /* Firmware Supports Adaptive CCA*/ ATH10K_FW_FEATURE_SUPPORTS_ADAPTIVE_CCA = 11, + /* Firmware supports management frame protection */ + ATH10K_FW_FEATURE_MFP_SUPPORT = 12, + /* keep last */ ATH10K_FW_FEATURE_COUNT, }; diff --git a/drivers/net/wireless/ath/ath10k/htt_tx.c b/drivers/net/wireless/ath/ath10k/htt_tx.c index 1682397..bd8f264 100644 --- a/drivers/net/wireless/ath/ath10k/htt_tx.c +++ b/drivers/net/wireless/ath/ath10k/htt_tx.c @@ -477,6 +477,13 @@ int ath10k_htt_mgmt_tx(struct ath10k_htt *htt, struct sk_buff *msdu) msdu_id = res; + if ((ieee80211_is_action(hdr->frame_control) || + ieee80211_is_deauth(hdr->frame_control) || + ieee80211_is_disassoc(hdr->frame_control)) && + ieee80211_has_protected(hdr->frame_control)) { + skb_put(msdu, IEEE80211_CCMP_MIC_LEN); + } + txdesc = ath10k_htc_alloc_skb(ar, len); if (!txdesc) { res = -ENOMEM; diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 2ccda0e..a53e213 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -1968,6 +1968,7 @@ static void ath10k_peer_assoc_h_basic(struct ath10k *ar, static void ath10k_peer_assoc_h_crypto(struct ath10k *ar, struct ieee80211_vif *vif, + struct ieee80211_sta *sta, struct wmi_peer_assoc_complete_arg *arg) { struct ieee80211_bss_conf *info = &vif->bss_conf; @@ -2009,6 +2010,11 @@ static void ath10k_peer_assoc_h_crypto(struct ath10k *ar, ath10k_dbg(ar, ATH10K_DBG_WMI, "%s: wpa ie found\n", __func__); arg->peer_flags |= ar->wmi.peer_flags->need_gtk_2_way; } + + if (sta->mfp && + test_bit(ATH10K_FW_FEATURE_MFP_SUPPORT, ar->fw_features)) { + arg->peer_flags |= ar->wmi.peer_flags->pmf; + } } static void ath10k_peer_assoc_h_rates(struct ath10k *ar, @@ -2480,7 +2486,7 @@ static int ath10k_peer_assoc_prepare(struct ath10k *ar, memset(arg, 0, sizeof(*arg)); ath10k_peer_assoc_h_basic(ar, vif, sta, arg); - ath10k_peer_assoc_h_crypto(ar, vif, arg); + ath10k_peer_assoc_h_crypto(ar, vif, sta, arg); ath10k_peer_assoc_h_rates(ar, vif, sta, arg); ath10k_peer_assoc_h_ht(ar, vif, sta, arg); ath10k_peer_assoc_h_vht(ar, vif, sta, arg); -- cgit v0.10.2 From 6dd46348b935043d8748ad39ef7e9275b0c53c47 Mon Sep 17 00:00:00 2001 From: Tamizh chelvam Date: Sat, 31 Oct 2015 11:20:32 +0200 Subject: ath10k: add thermal throttling support for 10.4 firmware This patch enables thermal throttling support for 10.4 firmware. 
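The key change in the diff below is the capability check: hwmon registration is now gated on whether the ops table provides a temperature hook rather than on a specific WMI op version. A minimal sketch of that idiom, with hypothetical names:

#include <stdio.h>

/* Capability detection via optional function pointers: the feature is
 * available iff the ops table fills in the hook.
 */
struct wmi_ops_model {
	int (*gen_pdev_get_temperature)(void);
};

static int model_get_temp(void) { return 55; }

static const struct wmi_ops_model ops_with_temp = {
	.gen_pdev_get_temperature = model_get_temp,
};
static const struct wmi_ops_model ops_without_temp = { NULL };

static void maybe_register_hwmon(const struct wmi_ops_model *ops)
{
	if (!ops->gen_pdev_get_temperature) {
		printf("no temperature op, skipping hwmon\n");
		return;
	}
	printf("temperature: %d C\n", ops->gen_pdev_get_temperature());
}

int main(void)
{
	maybe_register_hwmon(&ops_with_temp);
	maybe_register_hwmon(&ops_without_temp);
	return 0;
}

Testing the hook instead of the version means any future firmware branch that wires up the op gets hwmon support automatically.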
Signed-off-by: Tamizh chelvam Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/thermal.c b/drivers/net/wireless/ath/ath10k/thermal.c index 60fe562..444b52c 100644 --- a/drivers/net/wireless/ath/ath10k/thermal.c +++ b/drivers/net/wireless/ath/ath10k/thermal.c @@ -187,7 +187,7 @@ int ath10k_thermal_register(struct ath10k *ar) /* Do not register hwmon device when temperature reading is not * supported by firmware */ - if (ar->wmi.op_version != ATH10K_FW_WMI_OP_VERSION_10_2_4) + if (!(ar->wmi.ops->gen_pdev_get_temperature)) return 0; /* Avoid linking error on devm_hwmon_device_register_with_groups, I diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index 8cd068b..e3ce8f0 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -5103,6 +5103,9 @@ static void ath10k_wmi_10_4_op_rx(struct ath10k *ar, struct sk_buff *skb) case WMI_10_4_UPDATE_STATS_EVENTID: ath10k_wmi_event_update_stats(ar, skb); break; + case WMI_10_4_PDEV_TEMPERATURE_EVENTID: + ath10k_wmi_event_temperature(ar, skb); + break; default: ath10k_warn(ar, "Unknown eventid: %d\n", id); break; @@ -7599,6 +7602,7 @@ static const struct wmi_ops wmi_10_4_ops = { /* shared with 10.2 */ .gen_peer_assoc = ath10k_wmi_10_2_op_gen_peer_assoc, .gen_request_stats = ath10k_wmi_op_gen_request_stats, + .gen_pdev_get_temperature = ath10k_wmi_10_2_op_gen_pdev_get_temperature, }; int ath10k_wmi_attach(struct ath10k *ar) -- cgit v0.10.2 From e3c6225d3a2aae18b1e4bf7356660786a6595982 Mon Sep 17 00:00:00 2001 From: Vasanthakumar Thiagarajan Date: Thu, 29 Oct 2015 19:11:32 +0530 Subject: ath10k: add new service defines for 10.4 No functional changes; this adds new WMI service bits to be in sync with the 10.4 firmware. Signed-off-by: Vasanthakumar Thiagarajan Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h index 96dd98c..6fb4961 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.h +++ b/drivers/net/wireless/ath/ath10k/wmi.h @@ -175,6 +175,8 @@ enum wmi_service { WMI_SERVICE_AUX_SPECTRAL_INTF, WMI_SERVICE_AUX_CHAN_LOAD_INTF, WMI_SERVICE_BSS_CHANNEL_INFO_64, + WMI_SERVICE_EXT_RES_CFG_SUPPORT, + WMI_SERVICE_MESH, /* keep last */ WMI_SERVICE_MAX, @@ -286,6 +288,8 @@ enum wmi_10_4_service { WMI_10_4_SERVICE_AUX_SPECTRAL_INTF, WMI_10_4_SERVICE_AUX_CHAN_LOAD_INTF, WMI_10_4_SERVICE_BSS_CHANNEL_INFO_64, + WMI_10_4_SERVICE_EXT_RES_CFG_SUPPORT, + WMI_10_4_SERVICE_MESH, }; static inline char *wmi_service_name(int service_id) @@ -375,6 +379,8 @@ static inline char *wmi_service_name(int service_id) SVCSTR(WMI_SERVICE_AUX_SPECTRAL_INTF); SVCSTR(WMI_SERVICE_AUX_CHAN_LOAD_INTF); SVCSTR(WMI_SERVICE_BSS_CHANNEL_INFO_64); + SVCSTR(WMI_SERVICE_EXT_RES_CFG_SUPPORT); + SVCSTR(WMI_SERVICE_MESH); default: return NULL; } @@ -600,6 +606,10 @@ static inline void wmi_10_4_svc_map(const __le32 *in, unsigned long *out, WMI_SERVICE_AUX_CHAN_LOAD_INTF, len); SVCMAP(WMI_10_4_SERVICE_BSS_CHANNEL_INFO_64, WMI_SERVICE_BSS_CHANNEL_INFO_64, len); + SVCMAP(WMI_10_4_SERVICE_EXT_RES_CFG_SUPPORT, + WMI_SERVICE_EXT_RES_CFG_SUPPORT, len); + SVCMAP(WMI_10_4_SERVICE_MESH, + WMI_SERVICE_MESH, len); } #undef SVCMAP -- cgit v0.10.2 From 69d4315cc255b21c913f24662383dc8f8fc0718c Mon Sep 17 00:00:00 2001 From: Vasanthakumar Thiagarajan Date: Thu, 29 Oct 2015 19:11:33 +0530 Subject: ath10k: add new WMI cmd/event defines for 10.4 No real functionality change; this adds WMI command/event defines to be in sync with the 10.4 firmware. 
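Worth noting why such defines are always appended: these command and event IDs are sequential enums shared with firmware, so inserting anywhere but the end (or before a fixed sentinel) would renumber the later entries and break the wire protocol. A toy illustration with hypothetical IDs:

#include <stdio.h>

/* Appending keeps existing wire values stable; an insertion in the
 * middle would silently shift every later ID.
 */
enum cmd_id {
	CMD_FIRST = 0x9000,
	CMD_SET_CCA_PARAMS,        /* 0x9001 */
	CMD_BSS_CHAN_INFO_REQUEST, /* 0x9002 */
	/* new entries belong here, at the end: */
	CMD_EXT_RESOURCE_CFG,      /* 0x9003 */
};

int main(void)
{
	printf("CMD_EXT_RESOURCE_CFG = 0x%x\n", (unsigned)CMD_EXT_RESOURCE_CFG);
	return 0;
}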
Signed-off-by: Vasanthakumar Thiagarajan Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h index 6fb4961..9e8f7d0 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.h +++ b/drivers/net/wireless/ath/ath10k/wmi.h @@ -1586,6 +1586,9 @@ enum wmi_10_4_cmd_id { WMI_10_4_MU_CAL_START_CMDID, WMI_10_4_SET_CCA_PARAMS_CMDID, WMI_10_4_PDEV_BSS_CHAN_INFO_REQUEST_CMDID, + WMI_10_4_EXT_RESOURCE_CFG_CMDID, + WMI_10_4_VDEV_SET_IE_CMDID, + WMI_10_4_SET_LTEU_CONFIG_CMDID, WMI_10_4_PDEV_UTF_CMDID = WMI_10_4_END_CMDID - 1, }; @@ -1648,6 +1651,7 @@ enum wmi_10_4_event_id { WMI_10_4_PDEV_TEMPERATURE_EVENTID, WMI_10_4_PDEV_NFCAL_POWER_ALL_CHANNELS_EVENTID, WMI_10_4_PDEV_BSS_CHAN_INFO_EVENTID, + WMI_10_4_MU_REPORT_EVENTID, WMI_10_4_PDEV_UTF_EVENTID = WMI_10_4_END_EVENTID - 1, }; -- cgit v0.10.2 From 52e8ce133e955aa7ff996247a4a276bee55ac046 Mon Sep 17 00:00:00 2001 From: Vasanthakumar Thiagarajan Date: Thu, 29 Oct 2015 19:11:34 +0530 Subject: ath10k: add new pdev params defines to 10.4 No functionality change, just sync to the latest pdev params that 10.4 firmware defines. Signed-off-by: Vasanthakumar Thiagarajan Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h index 9e8f7d0..a35c91e 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.h +++ b/drivers/net/wireless/ath/ath10k/wmi.h @@ -3664,6 +3664,12 @@ enum wmi_10_4_pdev_param { WMI_10_4_PDEV_PARAM_WAPI_MBSSID_OFFSET, WMI_10_4_PDEV_PARAM_ARP_SRCADDR, WMI_10_4_PDEV_PARAM_ARP_DSTADDR, + WMI_10_4_PDEV_PARAM_TXPOWER_DECR_DB, + WMI_10_4_PDEV_PARAM_RX_BATCHMODE, + WMI_10_4_PDEV_PARAM_PACKET_AGGR_DELAY, + WMI_10_4_PDEV_PARAM_ATF_OBSS_NOISE_SCH, + WMI_10_4_PDEV_PARAM_ATF_OBSS_NOISE_SCALING_FACTOR, + WMI_10_4_PDEV_PARAM_CUST_TXPOWER_SCALE, }; struct wmi_pdev_set_param_cmd { -- cgit v0.10.2 From afb0bf7f530bef214fb8db4e05502f85d72961b4 Mon Sep 17 00:00:00 2001 From: Vivek Natarajan Date: Fri, 30 Oct 2015 14:57:58 +0530 Subject: ath10k: add support for pktlog in QCA99X0 This patch adds pktlog support for 10.4 fw versions. 
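To make the header handling below easier to follow, here is a standalone sketch of parsing a packed little-endian header like ath10k_pktlog_10_4_hdr from a raw byte buffer. The layout mirrors the struct added below, but the byte values and helper are invented for illustration; the driver itself uses __le16/__le32 accessors on the skb data:

#include <stdint.h>
#include <stdio.h>

/* Models the 16-byte 10.4 pktlog header; all fields little-endian. */
struct pktlog_10_4_hdr_model {
	uint16_t flags;
	uint16_t missed_cnt;
	uint16_t log_type;
	uint16_t size; /* payload length in bytes */
	uint32_t timestamp;
	uint32_t type_specific_data;
};

static uint16_t get_le16(const uint8_t *p)
{
	return (uint16_t)(p[0] | (p[1] << 8));
}

int main(void)
{
	/* A fabricated header (log_type=7, size=2) plus a 2-byte payload. */
	uint8_t buf[] = { 0, 0, 0, 0, 7, 0, 2, 0,
			  0, 0, 0, 0, 0, 0, 0, 0, 0xab, 0xcd };
	uint16_t payload_len = get_le16(&buf[6]);

	printf("log_type=%u payload_len=%u first payload byte=0x%02x\n",
	       (unsigned)get_le16(&buf[4]), (unsigned)payload_len,
	       buf[sizeof(struct pktlog_10_4_hdr_model)]);
	return 0;
}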
Signed-off-by: Vivek Natarajan Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h index 2bad50e..74ccfb29 100644 --- a/drivers/net/wireless/ath/ath10k/htt.h +++ b/drivers/net/wireless/ath/ath10k/htt.h @@ -1598,5 +1598,7 @@ int ath10k_htt_tx_alloc_msdu_id(struct ath10k_htt *htt, struct sk_buff *skb); void ath10k_htt_tx_free_msdu_id(struct ath10k_htt *htt, u16 msdu_id); int ath10k_htt_mgmt_tx(struct ath10k_htt *htt, struct sk_buff *); int ath10k_htt_tx(struct ath10k_htt *htt, struct sk_buff *); +void ath10k_htt_rx_pktlog_completion_handler(struct ath10k *ar, + struct sk_buff *skb); #endif diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c index 6060dda..d1dc1ba 100644 --- a/drivers/net/wireless/ath/ath10k/htt_rx.c +++ b/drivers/net/wireless/ath/ath10k/htt_rx.c @@ -2127,6 +2127,18 @@ void ath10k_htt_t2h_msg_handler(struct ath10k *ar, struct sk_buff *skb) } EXPORT_SYMBOL(ath10k_htt_t2h_msg_handler); +void ath10k_htt_rx_pktlog_completion_handler(struct ath10k *ar, + struct sk_buff *skb) +{ + struct ath10k_pktlog_10_4_hdr *hdr = + (struct ath10k_pktlog_10_4_hdr *)skb->data; + + trace_ath10k_htt_pktlog(ar, hdr->payload, + sizeof(*hdr) + __le16_to_cpu(hdr->size)); + dev_kfree_skb_any(skb); +} +EXPORT_SYMBOL(ath10k_htt_rx_pktlog_completion_handler); + static void ath10k_htt_txrx_compl_task(unsigned long ptr) { struct ath10k_htt *htt = (struct ath10k_htt *)ptr; diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h index 39966a0..557d8d2 100644 --- a/drivers/net/wireless/ath/ath10k/hw.h +++ b/drivers/net/wireless/ath/ath10k/hw.h @@ -273,6 +273,16 @@ struct ath10k_pktlog_hdr { u8 payload[0]; } __packed; +struct ath10k_pktlog_10_4_hdr { + __le16 flags; + __le16 missed_cnt; + __le16 log_type; + __le16 size; + __le32 timestamp; + __le32 type_specific_data; + u8 payload[0]; +} __packed; + enum ath10k_hw_rate_ofdm { ATH10K_HW_RATE_OFDM_48M = 0, ATH10K_HW_RATE_OFDM_24M, diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c index 3fca200..5c91a67 100644 --- a/drivers/net/wireless/ath/ath10k/pci.c +++ b/drivers/net/wireless/ath/ath10k/pci.c @@ -111,6 +111,7 @@ static void ath10k_pci_htc_tx_cb(struct ath10k_ce_pipe *ce_state); static void ath10k_pci_htc_rx_cb(struct ath10k_ce_pipe *ce_state); static void ath10k_pci_htt_tx_cb(struct ath10k_ce_pipe *ce_state); static void ath10k_pci_htt_rx_cb(struct ath10k_ce_pipe *ce_state); +static void ath10k_pci_pktlog_rx_cb(struct ath10k_ce_pipe *ce_state); static const struct ce_attr host_ce_config_wlan[] = { /* CE0: host->target HTC control and raw streams */ @@ -189,6 +190,7 @@ static const struct ce_attr host_ce_config_wlan[] = { .src_nentries = 0, .src_sz_max = 2048, .dest_nentries = 128, + .recv_cb = ath10k_pci_pktlog_rx_cb, }, /* CE9 target autonomous qcache memcpy */ @@ -1208,6 +1210,15 @@ static void ath10k_pci_htc_rx_cb(struct ath10k_ce_pipe *ce_state) ath10k_pci_process_rx_cb(ce_state, ath10k_htc_rx_completion_handler); } +/* Called by lower (CE) layer when data is received from the Target. + * Only 10.4 firmware uses separate CE to transfer pktlog data. + */ +static void ath10k_pci_pktlog_rx_cb(struct ath10k_ce_pipe *ce_state) +{ + ath10k_pci_process_rx_cb(ce_state, + ath10k_htt_rx_pktlog_completion_handler); +} + /* Called by lower (CE) layer when a send to HTT Target completes. 
*/ static void ath10k_pci_htt_tx_cb(struct ath10k_ce_pipe *ce_state) { -- cgit v0.10.2 From 844fa57227124c353049df02de809b3d6c9505e8 Mon Sep 17 00:00:00 2001 From: Yanbo Li Date: Sat, 31 Oct 2015 11:07:21 +0200 Subject: ath10k: debugfs file to enable Bluetooth coexistence feature As not all QCA98XX radios are connected to Bluetooth modules, enabling the BT coex feature in firmware will have side effects if the radio's GPIOs are connected to other (non-BT) HW modules. Add a debugfs file to control the firmware BT coex logic and leave the feature disabled by default to avoid btcoex being accidentally enabled. To enable this feature, execute: echo 1 > /sys/kernel/debug/ieee80211/phyX/ath10k/btcoex To disable: echo 0 > /sys/kernel/debug/ieee80211/phyX/ath10k/btcoex The firmware supports this feature since 10.2.4.54 on 2G-only boards; dual band or 5G boards don't support this. The feature's name is WMI_SERVICE_COEX_GPIO and the btcoex file is not created if firmware doesn't support it. Signed-off-by: Yanbo Li [kvalo@qca.qualcomm.com: use btcoex filename and other smaller fixes] Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h index c26f84e..c16f348 100644 --- a/drivers/net/wireless/ath/ath10k/core.h +++ b/drivers/net/wireless/ath/ath10k/core.h @@ -538,6 +538,9 @@ enum ath10k_dev_flags { /* Disable HW crypto engine */ ATH10K_FLAG_HW_CRYPTO_DISABLED, + + /* Bluetooth coexistance enabled */ + ATH10K_FLAG_BTCOEX, }; enum ath10k_cal_mode { diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c index 6cc1aa3..145c066 100644 --- a/drivers/net/wireless/ath/ath10k/debug.c +++ b/drivers/net/wireless/ath/ath10k/debug.c @@ -2074,6 +2074,68 @@ static const struct file_operations fops_quiet_period = { .open = simple_open }; +static ssize_t ath10k_write_btcoex(struct file *file, + const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct ath10k *ar = file->private_data; + char buf[32]; + size_t buf_size; + bool val; + + buf_size = min(count, (sizeof(buf) - 1)); + if (copy_from_user(buf, ubuf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + + if (strtobool(buf, &val) != 0) + return -EINVAL; + + mutex_lock(&ar->conf_mutex); + + if (!(test_bit(ATH10K_FLAG_BTCOEX, &ar->dev_flags) ^ val)) + goto exit; + + if (val) + set_bit(ATH10K_FLAG_BTCOEX, &ar->dev_flags); + else + clear_bit(ATH10K_FLAG_BTCOEX, &ar->dev_flags); + + if (ar->state != ATH10K_STATE_ON) + goto exit; + + ath10k_info(ar, "restarting firmware due to btcoex change"); + + queue_work(ar->workqueue, &ar->restart_work); + +exit: + mutex_unlock(&ar->conf_mutex); + + return count; +} + +static ssize_t ath10k_read_btcoex(struct file *file, char __user *ubuf, + size_t count, loff_t *ppos) +{ + char buf[32]; + struct ath10k *ar = file->private_data; + int len = 0; + + mutex_lock(&ar->conf_mutex); + len = scnprintf(buf, sizeof(buf) - len, "%d\n", + test_bit(ATH10K_FLAG_BTCOEX, &ar->dev_flags)); + mutex_unlock(&ar->conf_mutex); + + return simple_read_from_buffer(ubuf, count, ppos, buf, len); +} + +static const struct file_operations fops_btcoex = { + .read = ath10k_read_btcoex, + .write = ath10k_write_btcoex, + .open = simple_open }; int ath10k_debug_create(struct ath10k *ar) { ar->debug.fw_crash_data = vzalloc(sizeof(*ar->debug.fw_crash_data)); @@ -2183,6 +2245,10 @@ int ath10k_debug_register(struct ath10k *ar) debugfs_create_file("tpc_stats", S_IRUSR, ar->debug.debugfs_phy, ar, &fops_tpc_stats); + if 
(test_bit(WMI_SERVICE_COEX_GPIO, ar->wmi.svc_map)) + debugfs_create_file("btcoex", S_IRUGO | S_IWUSR, + ar->debug.debugfs_phy, ar, &fops_btcoex); + return 0; } diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index e3ce8f0..56806d9 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -5476,8 +5476,11 @@ static struct sk_buff *ath10k_wmi_10_2_op_gen_init(struct ath10k *ar) cmd = (struct wmi_init_cmd_10_2 *)buf->data; features = WMI_10_2_RX_BATCH_MODE; - if (test_bit(WMI_SERVICE_COEX_GPIO, ar->wmi.svc_map)) + + if (test_bit(ATH10K_FLAG_BTCOEX, &ar->dev_flags) && + test_bit(WMI_SERVICE_COEX_GPIO, ar->wmi.svc_map)) features |= WMI_10_2_COEX_GPIO; + cmd->resource_config.feature_mask = __cpu_to_le32(features); memcpy(&cmd->resource_config.common, &config, sizeof(config)); -- cgit v0.10.2 From 539da7877275edb21a76aa02fb2c147eff02c559 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 Nov 2015 22:57:00 +0000 Subject: x86/apic: Add a single-target IPI function to the apic We still fall back on the "send mask" versions if an apic definition doesn't have the single-target version, but at least this allows the (trivial) case for the common clustered x2apic case. Signed-off-by: Linus Torvalds Reviewed-by: Ingo Molnar Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.737120838@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a30316b..7f62ad4 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -303,6 +303,7 @@ struct apic { unsigned int *apicid); /* ipi */ + void (*send_IPI)(int cpu, int vector); void (*send_IPI_mask)(const struct cpumask *mask, int vector); void (*send_IPI_mask_allbutself)(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 12c8286..1dbf590 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -115,6 +115,18 @@ static atomic_t stopping_cpu = ATOMIC_INIT(-1); static bool smp_no_nmi_ipi = false; /* + * Helper wrapper: not all apic definitions support sending to + * a single CPU, so we fall back to sending to a mask. + */ +static void send_IPI_cpu(int cpu, int vector) +{ + if (apic->send_IPI) + apic->send_IPI(cpu, vector); + else + apic->send_IPI_mask(cpumask_of(cpu), vector); +} + +/* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... 
@@ -125,12 +137,12 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); + send_IPI_cpu(cpu, RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); + send_IPI_cpu(cpu, CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) -- cgit v0.10.2 From 7b6ce46cb3d096831dea3accacee4717c66abac8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 Nov 2015 22:57:00 +0000 Subject: x86/apic: Implement single target IPI function for x2apic_cluster [ tglx: Split it out from the patch which provides the new callback ] Signed-off-by: Linus Torvalds Reviewed-by: Ingo Molnar Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.817975597@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cc8311c..aca8b75 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -23,6 +23,14 @@ static inline u32 x2apic_cluster(int cpu) return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; } +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + + x2apic_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); +} + static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { @@ -266,6 +274,7 @@ static struct apic apic_x2apic_cluster = { .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_allbutself = x2apic_send_IPI_allbutself, -- cgit v0.10.2 From 53be0fac8bdaeec87e0df7d0334345421d2be187 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:01 +0000 Subject: x86/apic: Implement default single target IPI function apic_physflat and bigsmp_apic can share that implementation. 
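The shape of this whole series is visible in send_IPI_cpu() above: a new optional ->send_IPI callback with a mask-based fallback, which later patches populate for every APIC so the fallback can go away. A standalone model of the fallback idiom (hypothetical names, with a 64-bit integer standing in for a cpumask):

#include <stdint.h>
#include <stdio.h>

struct apic_model {
	void (*send_ipi)(int cpu, int vector);            /* optional */
	void (*send_ipi_mask)(uint64_t mask, int vector); /* required */
};

static void model_send_ipi_mask(uint64_t mask, int vector)
{
	printf("IPI vector %d to mask 0x%llx\n", vector,
	       (unsigned long long)mask);
}

/* Mirrors send_IPI_cpu(): use the single-target hook when present,
 * otherwise wrap the cpu in a one-bit mask.
 */
static void send_ipi_cpu(const struct apic_model *apic, int cpu, int vector)
{
	if (apic->send_ipi)
		apic->send_ipi(cpu, vector);
	else
		apic->send_ipi_mask(1ull << cpu, vector);
}

int main(void)
{
	struct apic_model apic = { .send_ipi_mask = model_send_ipi_mask };

	send_ipi_cpu(&apic, 3, 0xfd); /* takes the fallback path */
	return 0;
}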
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.898543767@linutronix.de diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 615fa90..22998a8 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -119,6 +119,7 @@ static inline void native_apic_mem_write(APIC_ICR, cfg); } +extern void default_send_IPI_single_phys(int cpu, int vector); extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 6207156..4fcffbf 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -18,6 +18,16 @@ #include #include +void default_send_IPI_single_phys(int cpu, int vector) +{ + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); +} + void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) { unsigned long query_cpu; -- cgit v0.10.2 From 449112f4f35074f1dc70d4f0e769cb14150c159c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:02 +0000 Subject: x86/apic: Remove pointless indirections from apic_physflat No value in having 32 byte extra text. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.975653382@linutronix.de diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f92ab36..6d3e1a6 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -230,17 +230,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) -{ - default_send_IPI_mask_sequence_phys(cpumask, vector); -} - -static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, - int vector) -{ - default_send_IPI_mask_allbutself_phys(cpumask, vector); -} - static void physflat_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -248,7 +237,7 @@ static void physflat_send_IPI_allbutself(int vector) static void physflat_send_IPI_all(int vector) { - physflat_send_IPI_mask(cpu_online_mask, vector); + default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); } static int physflat_probe(void) @@ -292,8 +281,8 @@ static struct apic apic_physflat = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, - .send_IPI_mask = physflat_send_IPI_mask, - .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, + .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, .send_IPI_allbutself = physflat_send_IPI_allbutself, .send_IPI_all = physflat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, -- cgit v0.10.2 From 68cd88ff8df97846eb07080f17264a4de50cb012 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:02 +0000 Subject: x86/apic: Wire up single IPI for apic_physflat Use the default implementation. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.055046864@linutronix.de diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 6d3e1a6..9de25d4 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -281,6 +281,7 @@ static struct apic apic_physflat = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, .send_IPI_allbutself = physflat_send_IPI_allbutself, -- cgit v0.10.2 From 500bd02fb17e5d9296c77ccc07db61fd5d4922a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:03 +0000 Subject: x86/apic: Remove pointless indirections from bigsmp_apic Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.133086575@linutronix.de diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 971cf88..d4d103b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -96,11 +96,6 @@ static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) return cpuid_apic >> index_msb; } -static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_sequence_phys(mask, vector); -} - static void bigsmp_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -108,7 +103,7 @@ static void bigsmp_send_IPI_allbutself(int vector) static void bigsmp_send_IPI_all(int vector) { - bigsmp_send_IPI_mask(cpu_online_mask, vector); + default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); } static int dmi_bigsmp; /* can be set by dmi scanners */ @@ -180,7 +175,7 @@ static struct apic apic_bigsmp = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, - .send_IPI_mask = bigsmp_send_IPI_mask, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = NULL, .send_IPI_allbutself = bigsmp_send_IPI_allbutself, .send_IPI_all = bigsmp_send_IPI_all, -- cgit v0.10.2 From 5789a12e28f7bf6a37564a5fc9ebc60dc86659b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:04 +0000 Subject: x86/apic: Wire up single IPI for bigsmp_apic Use the default implementation. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.213292642@linutronix.de diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index d4d103b..cf9bd89 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -175,6 +175,7 @@ static struct apic apic_bigsmp = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = NULL, .send_IPI_allbutself = bigsmp_send_IPI_allbutself, -- cgit v0.10.2 From f2bffe8a3eef42a1cd3393d56acd9fe598d2119c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:04 +0000 Subject: x86/apic: Implement single IPI for x2apic_phys Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.296438009@linutronix.de diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 662e915..a1242e2 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -36,6 +36,14 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys()); } +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + x2apic_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL); +} + static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { @@ -122,6 +130,7 @@ static struct apic apic_x2apic_phys = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_allbutself = x2apic_send_IPI_allbutself, -- cgit v0.10.2 From 8642ea953d99fc037c1076e9a8b3a822025fb251 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:05 +0000 Subject: x86/apic: Wire up single IPI for x2apic_uv The function already exists. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.376775625@linutronix.de diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 4a13946..d760c6b 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -406,6 +406,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, + .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_allbutself = uv_send_IPI_allbutself, -- cgit v0.10.2 From c61a0d31ba0ce75cb1b88bb4eb2f41a1b80bc90f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:06 +0000 Subject: x86/apic: Wire up single IPI for apic_numachip The function already exists. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.551445489@linutronix.de diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 38dd5ef..69329a6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -276,6 +276,7 @@ static const struct apic apic_numachip1 __refconst = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, .send_IPI_allbutself = numachip_send_IPI_allbutself, @@ -327,6 +328,7 @@ static const struct apic apic_numachip2 __refconst = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, .send_IPI_allbutself = numachip_send_IPI_allbutself, -- cgit v0.10.2 From 4727da2eb1ec79fdc2acdd2f764b5b2aacab998c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:06 +0000 Subject: x86/apic: Implement single IPI for apic_noop Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.455429817@linutronix.de diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 0d96749..331a7a0 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -30,6 +30,7 @@ #include static void noop_init_apic_ldr(void) { } +static void noop_send_IPI(int cpu, int vector) { } static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_allbutself(int vector) { } @@ -144,6 +145,7 @@ struct apic apic_noop = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = noop_send_IPI, .send_IPI_mask = noop_send_IPI_mask, .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, .send_IPI_allbutself = noop_send_IPI_allbutself, -- cgit v0.10.2 From 7e29393b20a1a863a5f9bf48dc71e5cff4035ff5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:07 +0000 Subject: x86/apic: Provide default send single IPI wrapper Instead of doing the wrapping in the smp code we can provide a default wrapper for those APICs which insist on cpumasks. 
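A sketch of the complementary side (hypothetical names): instead of branching at every call site, APICs that only implement the mask variant get a shared default that does the wrapping once, so their ->send_IPI slot is never NULL:

#include <stdint.h>
#include <stdio.h>

static void logical_send_ipi_mask(uint64_t mask, int vector)
{
	printf("mask IPI: vector %d mask 0x%llx\n", vector,
	       (unsigned long long)mask);
}

/* Shared default, modeled on default_send_IPI_single(): wrap the cpu
 * in a one-bit "cpumask" and reuse the mask primitive.
 */
static void default_send_ipi_single(int cpu, int vector)
{
	logical_send_ipi_mask(1ull << cpu, vector);
}

int main(void)
{
	/* An apic instance would store this in its send_IPI slot. */
	void (*send_ipi)(int cpu, int vector) = default_send_ipi_single;

	send_ipi(2, 0xf7);
	return 0;
}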
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.631111846@linutronix.de diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 22998a8..cfc9a0d 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -119,6 +119,7 @@ static inline void native_apic_mem_write(APIC_ICR, cfg); } +extern void default_send_IPI_single(int cpu, int vector); extern void default_send_IPI_single_phys(int cpu, int vector); extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 4fcffbf..eb45fc9 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -65,6 +65,14 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, local_irq_restore(flags); } +/* + * Helper function for APICs which insist on cpumasks + */ +void default_send_IPI_single(int cpu, int vector) +{ + apic->send_IPI_mask(cpumask_of(cpu), vector); +} + #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, -- cgit v0.10.2 From 6153058a03f4cc5200b0b29e201caa11779ebca0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:08 +0000 Subject: x86/apic: Use default send single IPI wrapper Wire up the default_send_IPI_single() wrapper to the last holdouts. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.711224890@linutronix.de diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 9de25d4..9968f30 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -185,6 +185,7 @@ static struct apic apic_flat = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, .send_IPI_allbutself = flat_send_IPI_allbutself, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7694ae6..f316e34 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -105,6 +105,7 @@ static struct apic apic_default = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single, .send_IPI_mask = default_send_IPI_mask_logical, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, .send_IPI_allbutself = default_send_IPI_allbutself, -- cgit v0.10.2 From 72613184a1f076659e8a902d64351f50d3f9c990 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:09 +0000 Subject: x86/smp: Remove single IPI wrapper All APIC implementations have send_IPI now. Remove the conditional in the calling code. 
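Once every instance fills the slot, the per-call NULL check can be dropped, as the diff below does. In the kernel this simply relies on all implementations having been converted; the sketch here adds an init-time assert purely to make that invariant explicit (the assert is illustrative, not something the patch adds):

#include <assert.h>
#include <stdio.h>

struct apic_model {
	void (*send_ipi)(int cpu, int vector);
};

static void model_send_ipi(int cpu, int vector)
{
	printf("IPI vector %d -> cpu %d\n", vector, cpu);
}

static struct apic_model apic = { .send_ipi = model_send_ipi };

int main(void)
{
	/* One init-time check replaces a branch on every IPI send. */
	assert(apic.send_ipi);
	apic.send_ipi(1, 0xfd);
	return 0;
}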
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.807817097@linutronix.de diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 1dbf590..658777c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -115,18 +115,6 @@ static atomic_t stopping_cpu = ATOMIC_INIT(-1); static bool smp_no_nmi_ipi = false; /* - * Helper wrapper: not all apic definitions support sending to - * a single CPU, so we fall back to sending to a mask. - */ -static void send_IPI_cpu(int cpu, int vector) -{ - if (apic->send_IPI) - apic->send_IPI(cpu, vector); - else - apic->send_IPI_mask(cpumask_of(cpu), vector); -} - -/* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... @@ -137,12 +125,12 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - send_IPI_cpu(cpu, RESCHEDULE_VECTOR); + apic->send_IPI(cpu, RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - send_IPI_cpu(cpu, CALL_FUNCTION_SINGLE_VECTOR); + apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) -- cgit v0.10.2 From 79f1d836925c545b4612f7ed19423f0950978b5e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 3 Nov 2015 10:18:49 +0100 Subject: x86/paravirt: Kill some unused patching functions paravirt_patch_ignore() is completely unused and paravirt_patch_nop() doesn't do a whole lot. Remove them both. Signed-off-by: Borislav Petkov Reviewed-by: Juergen Gross Cc: Andrew Morton Cc: Andy Lutomirski Cc: Chris Wright Cc: Jeremy Fitzhardinge Cc: "Peter Zijlstra (Intel)" Cc: Rusty Russell Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1446542329-32037-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 31247b5..e1f31df 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -402,10 +402,8 @@ extern struct pv_lock_ops pv_lock_ops; __visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \ asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name)) -unsigned paravirt_patch_nop(void); unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_ignore(unsigned len); unsigned paravirt_patch_call(void *insnbuf, const void *target, u16 tgt_clobbers, unsigned long addr, u16 site_clobbers, diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c2130ae..4f32a10 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -74,16 +74,6 @@ void __init default_banner(void) /* Undefined instruction for dealing with missing ops pointers. 
*/ static const unsigned char ud2a[] = { 0x0f, 0x0b }; -unsigned paravirt_patch_nop(void) -{ - return 0; -} - -unsigned paravirt_patch_ignore(unsigned len) -{ - return len; -} - struct branch { unsigned char opcode; u32 delta; @@ -152,8 +142,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, /* If there's no function, patch it with a ud2a (BUG) */ ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); else if (opfunc == _paravirt_nop) - /* If the operation is a nop, then nop the callsite */ - ret = paravirt_patch_nop(); + ret = 0; /* identity functions just return their single argument */ else if (opfunc == _paravirt_ident_32) -- cgit v0.10.2 From acc546fd6108cb17f87f985e4235b68756d7b01f Mon Sep 17 00:00:00 2001 From: Abhi Das Date: Tue, 10 Nov 2015 15:07:26 -0600 Subject: gfs2: Automatically set GFS2_DIF_SYSTEM flag on system files When new files and directories are created inside a parent directory we automatically inherit the GFS2_DIF_SYSTEM flag (if set) and assign it to the new file/dirs. All new system files/dirs created in the metafs by, say gfs2_jadd, will have this flag set because they will have parent directories in the metafs whose GFS2_DIF_SYSTEM flag has already been set (most likely by a previous mkfs.gfs2) Signed-off-by: Abhi Das Signed-off-by: Bob Peterson diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 5e42546..2012820 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -298,9 +298,9 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) gfsflags &= ~GFS2_DIF_TOPDIR; if (gfsflags & GFS2_DIF_INHERIT_JDATA) gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - return do_gfs2_set_flags(filp, gfsflags, ~0); + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM); } - return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); + return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA)); } static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 063fdfc..2c05bc3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -685,6 +685,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, ip->i_entries = 2; break; } + + /* Force SYSTEM flag on all files and subdirs of a SYSTEM directory */ + if (dip->i_diskflags & GFS2_DIF_SYSTEM) + ip->i_diskflags |= GFS2_DIF_SYSTEM; + gfs2_set_inode_flags(inode); if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) || -- cgit v0.10.2 From b54e16f1040adcd7341ab07ca07ee8577144e4fa Mon Sep 17 00:00:00 2001 From: Vasanthakumar Thiagarajan Date: Thu, 5 Nov 2015 11:33:58 +0530 Subject: ath10k: fix peer assoc complete WMI command for 10.4 There is an extra 4-byte member when compared to WMI 10.2 added to assoc complete command in WMI 10.4. This new member is used for 160Mhz related configuration. This WMI command mismatch between host and firmware does not cause any real issues because this new member is not used in 10.4 firmwares so far (10.4.1.00030-1). This difference in WMI command interface brings in a new wmi_ops for 10.4 gen_peer_assoc(). No noticeable functionality differences with this change can be seen with the current 10.4 firmwares, but the WMI interface has to be fixed to work with future 10.4 firmwares which may be using this new member. 
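The wire-format relationship is easiest to see in a standalone sketch (hypothetical field set; the real structs are __packed with __le32 members): the 10.4 command embeds the 10.2 command and appends one word, so a shared helper can fill the common prefix and the 10.4 path only touches the new field:

#include <stdint.h>
#include <stdio.h>

struct cmd_10_2_model {
	uint32_t vdev_id;
	uint32_t peer_flags;
};

/* 10.4 = the 10.2 layout plus one trailing word, as in
 * wmi_10_4_peer_assoc_complete_cmd.
 */
struct cmd_10_4_model {
	struct cmd_10_2_model cmd;
	uint32_t peer_bw_rxnss_override;
};

static void fill_10_2(void *buf)
{
	struct cmd_10_2_model *c = buf;

	c->vdev_id = 1;
	c->peer_flags = 0x3;
}

static void fill_10_4(void *buf)
{
	struct cmd_10_4_model *c = buf;

	fill_10_2(buf);                /* common prefix is layout-compatible */
	c->peer_bw_rxnss_override = 0; /* new 160 MHz field, unused so far */
}

int main(void)
{
	struct cmd_10_4_model c;

	fill_10_4(&c);
	printf("10.2 len=%zu, 10.4 len=%zu\n",
	       sizeof(struct cmd_10_2_model), sizeof(struct cmd_10_4_model));
	return 0;
}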
Signed-off-by: Vasanthakumar Thiagarajan Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index 56806d9..9021079 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -6376,6 +6376,16 @@ ath10k_wmi_peer_assoc_fill_10_2(struct ath10k *ar, void *buf, cmd->info0 = __cpu_to_le32(info0); } +static void +ath10k_wmi_peer_assoc_fill_10_4(struct ath10k *ar, void *buf, + const struct wmi_peer_assoc_complete_arg *arg) +{ + struct wmi_10_4_peer_assoc_complete_cmd *cmd = buf; + + ath10k_wmi_peer_assoc_fill_10_2(ar, buf, arg); + cmd->peer_bw_rxnss_override = 0; +} + static int ath10k_wmi_peer_assoc_check_arg(const struct wmi_peer_assoc_complete_arg *arg) { @@ -6465,6 +6475,31 @@ ath10k_wmi_10_2_op_gen_peer_assoc(struct ath10k *ar, } static struct sk_buff * +ath10k_wmi_10_4_op_gen_peer_assoc(struct ath10k *ar, + const struct wmi_peer_assoc_complete_arg *arg) +{ + size_t len = sizeof(struct wmi_10_4_peer_assoc_complete_cmd); + struct sk_buff *skb; + int ret; + + ret = ath10k_wmi_peer_assoc_check_arg(arg); + if (ret) + return ERR_PTR(ret); + + skb = ath10k_wmi_alloc_skb(ar, len); + if (!skb) + return ERR_PTR(-ENOMEM); + + ath10k_wmi_peer_assoc_fill_10_4(ar, skb->data, arg); + + ath10k_dbg(ar, ATH10K_DBG_WMI, + "wmi peer assoc vdev %d addr %pM (%s)\n", + arg->vdev_id, arg->addr, + arg->peer_reassoc ? "reassociate" : "new"); + return skb; +} + +static struct sk_buff * ath10k_wmi_10_2_op_gen_pdev_get_temperature(struct ath10k *ar) { struct sk_buff *skb; @@ -7584,6 +7619,7 @@ static const struct wmi_ops wmi_10_4_ops = { .gen_peer_delete = ath10k_wmi_op_gen_peer_delete, .gen_peer_flush = ath10k_wmi_op_gen_peer_flush, .gen_peer_set_param = ath10k_wmi_op_gen_peer_set_param, + .gen_peer_assoc = ath10k_wmi_10_4_op_gen_peer_assoc, .gen_set_psmode = ath10k_wmi_op_gen_set_psmode, .gen_set_sta_ps = ath10k_wmi_op_gen_set_sta_ps, .gen_set_ap_ps = ath10k_wmi_op_gen_set_ap_ps, @@ -7603,7 +7639,6 @@ static const struct wmi_ops wmi_10_4_ops = { .fw_stats_fill = ath10k_wmi_10_4_op_fw_stats_fill, /* shared with 10.2 */ - .gen_peer_assoc = ath10k_wmi_10_2_op_gen_peer_assoc, .gen_request_stats = ath10k_wmi_op_gen_request_stats, .gen_pdev_get_temperature = ath10k_wmi_10_2_op_gen_pdev_get_temperature, }; diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h index a35c91e..80d3f1c 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.h +++ b/drivers/net/wireless/ath/ath10k/wmi.h @@ -5799,6 +5799,11 @@ struct wmi_10_2_peer_assoc_complete_cmd { __le32 info0; /* WMI_PEER_ASSOC_INFO0_ */ } __packed; +struct wmi_10_4_peer_assoc_complete_cmd { + struct wmi_10_2_peer_assoc_complete_cmd cmd; + __le32 peer_bw_rxnss_override; +} __packed; + struct wmi_peer_assoc_complete_arg { u8 addr[ETH_ALEN]; u32 vdev_id; -- cgit v0.10.2 From 8921f5f7781db8bc5f21451eb6816c2cc293c613 Mon Sep 17 00:00:00 2001 From: Vasanthakumar Thiagarajan Date: Thu, 5 Nov 2015 11:33:59 +0530 Subject: ath10k: rename the helper which is used for off-channel tx Rename ath10k_mac_need_offchan_tx_work() to ath10k_mac_tx_frm_has_freq() to make it more meaningful. This helper will be used in the future change. No functionality change. 
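Since the old helper answered "does this firmware still need the off-channel tx worker?" and the new one asks the positive question, the rename also inverts the predicate; informally (a sketch of the equivalence, not code from the patch):

/* call sites replace the old test ... */
if (ath10k_mac_need_offchan_tx_work(ar)) { /* ... */ }
/* ... with its negation: */
if (!ath10k_mac_tx_frm_has_freq(ar)) { /* ... */ }

The helper body and the call sites are negated together, which is why the patch remains a functional no-op.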
Signed-off-by: Vasanthakumar Thiagarajan Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index a53e213..363a99c 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -3287,7 +3287,7 @@ static void ath10k_tx_h_add_p2p_noa_ie(struct ath10k *ar, } } -static bool ath10k_mac_need_offchan_tx_work(struct ath10k *ar) +static bool ath10k_mac_tx_frm_has_freq(struct ath10k *ar) { /* FIXME: Not really sure since when the behaviour changed. At some * point new firmware stopped requiring creation of peer entries for @@ -3295,8 +3295,8 @@ static bool ath10k_mac_need_offchan_tx_work(struct ath10k *ar) * tx credit replenishment and reliability). Assuming it's at least 3.4 * because that's when the `freq` was introduced to TX_FRM HTT command. */ - return !(ar->htt.target_version_major >= 3 && - ar->htt.target_version_minor >= 4); + return (ar->htt.target_version_major >= 3 && + ar->htt.target_version_minor >= 4); } static int ath10k_mac_tx_wmi_mgmt(struct ath10k *ar, struct sk_buff *skb) @@ -3680,7 +3680,7 @@ static void ath10k_tx(struct ieee80211_hw *hw, ATH10K_SKB_CB(skb)->vdev_id = ar->scan.vdev_id; spin_unlock_bh(&ar->data_lock); - if (ath10k_mac_need_offchan_tx_work(ar)) { + if (!ath10k_mac_tx_frm_has_freq(ar)) { ATH10K_SKB_CB(skb)->htt.freq = 0; ATH10K_SKB_CB(skb)->htt.is_offchan = true; -- cgit v0.10.2 From d39de9919a0cded8ddb1655441cf403ad5690d6d Mon Sep 17 00:00:00 2001 From: Vasanthakumar Thiagarajan Date: Thu, 5 Nov 2015 11:34:00 +0530 Subject: ath10k: fix peerid configuration in htt tx desc for htt version < 3.4 Of a word in struct htt_data_tx_desc htt version >= 3.4 firmware uses LSB 16-bit for frequency configuration which is used for offchannel tx and MSB 16-bit is for peerid. But other firmwares using version 2.X (10.1, 10.2.2, 10.2.4 and 10.4) are using 32-bit for peerid in htt tx desc. So far no issue is found with the existing code setting peerid and freq for HTT version 2.X, this could be mainly because of 0 as frequecy (home channel) is being always passed with those firmwares. There may be issues when non-zero freq is passed with firmware using < 3.4 htt version. To be safe use target_version_major and target_version_minor along with htt-op-version before configuring peer id and freq in htt tx desc. This patch extends ath10k_mac_tx_frm_has_freq() to check for htt_op_version_tlv and uses the helper while setting peerid in htt_tx_desc. 
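A worked example of why the overlay matters on the wire, assuming HTT_INVALID_PEERID is 0xffff (as the driver passes below) and an off-channel frequency of 2412 MHz; all fields are little-endian and the byte values are illustrative, not from the patch:

/* htt >= 3.4: the word carries two 16-bit fields */
__le16 peerid = __cpu_to_le16(0xffff);	/* bytes: ff ff */
__le16 freq   = __cpu_to_le16(2412);	/* bytes: 6c 09 */
					/* word on the wire: ff ff 6c 09 */

/* htt < 3.4: the same word is one 32-bit peerid */
__le32 peerid32 = __cpu_to_le32(0xffff);	/* word on the wire: ff ff 00 00 */

With a non-zero freq, the 16/16 layout would hand an htt < 3.4 firmware a peerid of 0x096cffff instead of 0x0000ffff, which is exactly what the version check in this patch prevents.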
Fixes: 8d6d36243610 ("ath10k: fix offchan reliability") Signed-off-by: Vasanthakumar Thiagarajan Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h index 74ccfb29..2b87ed6 100644 --- a/drivers/net/wireless/ath/ath10k/htt.h +++ b/drivers/net/wireless/ath/ath10k/htt.h @@ -166,8 +166,13 @@ struct htt_data_tx_desc { __le16 len; __le16 id; __le32 frags_paddr; - __le16 peerid; - __le16 freq; + union { + __le32 peerid; + struct { + __le16 peerid; + __le16 freq; + } __packed offchan_tx; + } __packed; u8 prefetch[0]; /* start of frame, for FW classification engine */ } __packed; diff --git a/drivers/net/wireless/ath/ath10k/htt_tx.c b/drivers/net/wireless/ath/ath10k/htt_tx.c index bd8f264..8f76b9d 100644 --- a/drivers/net/wireless/ath/ath10k/htt_tx.c +++ b/drivers/net/wireless/ath/ath10k/htt_tx.c @@ -688,8 +688,15 @@ int ath10k_htt_tx(struct ath10k_htt *htt, struct sk_buff *msdu) skb_cb->htt.txbuf->cmd_tx.len = __cpu_to_le16(msdu->len); skb_cb->htt.txbuf->cmd_tx.id = __cpu_to_le16(msdu_id); skb_cb->htt.txbuf->cmd_tx.frags_paddr = __cpu_to_le32(frags_paddr); - skb_cb->htt.txbuf->cmd_tx.peerid = __cpu_to_le16(HTT_INVALID_PEERID); - skb_cb->htt.txbuf->cmd_tx.freq = __cpu_to_le16(skb_cb->htt.freq); + if (ath10k_mac_tx_frm_has_freq(ar)) { + skb_cb->htt.txbuf->cmd_tx.offchan_tx.peerid = + __cpu_to_le16(HTT_INVALID_PEERID); + skb_cb->htt.txbuf->cmd_tx.offchan_tx.freq = + __cpu_to_le16(skb_cb->htt.freq); + } else { + skb_cb->htt.txbuf->cmd_tx.peerid = + __cpu_to_le32(HTT_INVALID_PEERID); + } trace_ath10k_htt_tx(ar, msdu_id, msdu->len, vdev_id, tid); ath10k_dbg(ar, ATH10K_DBG_HTT, diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 363a99c..76484a9 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -3287,7 +3287,7 @@ static void ath10k_tx_h_add_p2p_noa_ie(struct ath10k *ar, } } -static bool ath10k_mac_tx_frm_has_freq(struct ath10k *ar) +bool ath10k_mac_tx_frm_has_freq(struct ath10k *ar) { /* FIXME: Not really sure since when the behaviour changed. At some * point new firmware stopped requiring creation of peer entries for @@ -3296,7 +3296,8 @@ static bool ath10k_mac_tx_frm_has_freq(struct ath10k *ar) * because that's when the `freq` was introduced to TX_FRM HTT command. 
*/ return (ar->htt.target_version_major >= 3 && - ar->htt.target_version_minor >= 4); + ar->htt.target_version_minor >= 4 && + ar->htt.op_version == ATH10K_FW_HTT_OP_VERSION_TLV); } static int ath10k_mac_tx_wmi_mgmt(struct ath10k *ar, struct sk_buff *skb) diff --git a/drivers/net/wireless/ath/ath10k/mac.h b/drivers/net/wireless/ath/ath10k/mac.h index e3cefe4..f504804 100644 --- a/drivers/net/wireless/ath/ath10k/mac.h +++ b/drivers/net/wireless/ath/ath10k/mac.h @@ -74,6 +74,7 @@ void ath10k_mac_tx_lock(struct ath10k *ar, int reason); void ath10k_mac_tx_unlock(struct ath10k *ar, int reason); void ath10k_mac_vif_tx_lock(struct ath10k_vif *arvif, int reason); void ath10k_mac_vif_tx_unlock(struct ath10k_vif *arvif, int reason); +bool ath10k_mac_tx_frm_has_freq(struct ath10k *ar); static inline struct ath10k_vif *ath10k_vif_to_arvif(struct ieee80211_vif *vif) { -- cgit v0.10.2 From 023e06dfa6882f500b9c86fd61f0b1913aa07f36 Mon Sep 17 00:00:00 2001 From: Hakjoo Kim Date: Sun, 15 Mar 2015 23:00:32 +0100 Subject: pinctrl: exynos: add exynos5410 SoC specific data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Samsung EXYNOS5410 SoC specific data to enable pinctrl support for all platforms based on EXYNOS5410. Signed-off-by: Hakjoo Kim [AF: Rebased onto Exynos5260, irq_chip consolidation, const'ification] Signed-off-by: Andreas Färber Acked-by: Tomasz Figa Tested-by: Pavel Fedin [k.kozlowski: Rebased on current v4.3] Signed-off-by: Krzysztof Kozlowski diff --git a/Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt index 9d2a995..6db16b9 100644 --- a/Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt @@ -17,6 +17,7 @@ Required Properties: - "samsung,exynos4x12-pinctrl": for Exynos4x12 compatible pin-controller. - "samsung,exynos5250-pinctrl": for Exynos5250 compatible pin-controller. - "samsung,exynos5260-pinctrl": for Exynos5260 compatible pin-controller. + - "samsung,exynos5410-pinctrl": for Exynos5410 compatible pin-controller. - "samsung,exynos5420-pinctrl": for Exynos5420 compatible pin-controller. - "samsung,exynos7-pinctrl": for Exynos7 compatible pin-controller. 
diff --git a/drivers/pinctrl/samsung/pinctrl-exynos.c b/drivers/pinctrl/samsung/pinctrl-exynos.c index 71ccf6a..16e2293 100644 --- a/drivers/pinctrl/samsung/pinctrl-exynos.c +++ b/drivers/pinctrl/samsung/pinctrl-exynos.c @@ -1150,6 +1150,109 @@ const struct samsung_pin_ctrl exynos5260_pin_ctrl[] __initconst = { }, }; +/* pin banks of exynos5410 pin-controller 0 */ +static const struct samsung_pin_bank_data exynos5410_pin_banks0[] __initconst = { + EXYNOS_PIN_BANK_EINTG(8, 0x000, "gpa0", 0x00), + EXYNOS_PIN_BANK_EINTG(6, 0x020, "gpa1", 0x04), + EXYNOS_PIN_BANK_EINTG(8, 0x040, "gpa2", 0x08), + EXYNOS_PIN_BANK_EINTG(5, 0x060, "gpb0", 0x0c), + EXYNOS_PIN_BANK_EINTG(5, 0x080, "gpb1", 0x10), + EXYNOS_PIN_BANK_EINTG(4, 0x0A0, "gpb2", 0x14), + EXYNOS_PIN_BANK_EINTG(4, 0x0C0, "gpb3", 0x18), + EXYNOS_PIN_BANK_EINTG(7, 0x0E0, "gpc0", 0x1c), + EXYNOS_PIN_BANK_EINTG(4, 0x100, "gpc3", 0x20), + EXYNOS_PIN_BANK_EINTG(7, 0x120, "gpc1", 0x24), + EXYNOS_PIN_BANK_EINTG(7, 0x140, "gpc2", 0x28), + EXYNOS_PIN_BANK_EINTN(2, 0x160, "gpm5"), + EXYNOS_PIN_BANK_EINTG(8, 0x180, "gpd1", 0x2c), + EXYNOS_PIN_BANK_EINTG(8, 0x1A0, "gpe0", 0x30), + EXYNOS_PIN_BANK_EINTG(2, 0x1C0, "gpe1", 0x34), + EXYNOS_PIN_BANK_EINTG(6, 0x1E0, "gpf0", 0x38), + EXYNOS_PIN_BANK_EINTG(8, 0x200, "gpf1", 0x3c), + EXYNOS_PIN_BANK_EINTG(8, 0x220, "gpg0", 0x40), + EXYNOS_PIN_BANK_EINTG(8, 0x240, "gpg1", 0x44), + EXYNOS_PIN_BANK_EINTG(2, 0x260, "gpg2", 0x48), + EXYNOS_PIN_BANK_EINTG(4, 0x280, "gph0", 0x4c), + EXYNOS_PIN_BANK_EINTG(8, 0x2A0, "gph1", 0x50), + EXYNOS_PIN_BANK_EINTN(8, 0x2C0, "gpm7"), + EXYNOS_PIN_BANK_EINTN(6, 0x2E0, "gpy0"), + EXYNOS_PIN_BANK_EINTN(4, 0x300, "gpy1"), + EXYNOS_PIN_BANK_EINTN(6, 0x320, "gpy2"), + EXYNOS_PIN_BANK_EINTN(8, 0x340, "gpy3"), + EXYNOS_PIN_BANK_EINTN(8, 0x360, "gpy4"), + EXYNOS_PIN_BANK_EINTN(8, 0x380, "gpy5"), + EXYNOS_PIN_BANK_EINTN(8, 0x3A0, "gpy6"), + EXYNOS_PIN_BANK_EINTN(8, 0x3C0, "gpy7"), + EXYNOS_PIN_BANK_EINTW(8, 0xC00, "gpx0", 0x00), + EXYNOS_PIN_BANK_EINTW(8, 0xC20, "gpx1", 0x04), + EXYNOS_PIN_BANK_EINTW(8, 0xC40, "gpx2", 0x08), + EXYNOS_PIN_BANK_EINTW(8, 0xC60, "gpx3", 0x0c), +}; + +/* pin banks of exynos5410 pin-controller 1 */ +static const struct samsung_pin_bank_data exynos5410_pin_banks1[] __initconst = { + EXYNOS_PIN_BANK_EINTG(5, 0x000, "gpj0", 0x00), + EXYNOS_PIN_BANK_EINTG(8, 0x020, "gpj1", 0x04), + EXYNOS_PIN_BANK_EINTG(8, 0x040, "gpj2", 0x08), + EXYNOS_PIN_BANK_EINTG(8, 0x060, "gpj3", 0x0c), + EXYNOS_PIN_BANK_EINTG(2, 0x080, "gpj4", 0x10), + EXYNOS_PIN_BANK_EINTG(8, 0x0A0, "gpk0", 0x14), + EXYNOS_PIN_BANK_EINTG(8, 0x0C0, "gpk1", 0x18), + EXYNOS_PIN_BANK_EINTG(8, 0x0E0, "gpk2", 0x1c), + EXYNOS_PIN_BANK_EINTG(7, 0x100, "gpk3", 0x20), +}; + +/* pin banks of exynos5410 pin-controller 2 */ +static const struct samsung_pin_bank_data exynos5410_pin_banks2[] __initconst = { + EXYNOS_PIN_BANK_EINTG(8, 0x000, "gpv0", 0x00), + EXYNOS_PIN_BANK_EINTG(8, 0x020, "gpv1", 0x04), + EXYNOS_PIN_BANK_EINTG(8, 0x060, "gpv2", 0x08), + EXYNOS_PIN_BANK_EINTG(8, 0x080, "gpv3", 0x0c), + EXYNOS_PIN_BANK_EINTG(2, 0x0C0, "gpv4", 0x10), +}; + +/* pin banks of exynos5410 pin-controller 3 */ +static const struct samsung_pin_bank_data exynos5410_pin_banks3[] __initconst = { + EXYNOS_PIN_BANK_EINTG(7, 0x000, "gpz", 0x00), +}; + +/* + * Samsung pinctrl driver data for Exynos5410 SoC. Exynos5410 SoC includes + * four gpio/pin-mux/pinconfig controllers. 
+ */ +const struct samsung_pin_ctrl exynos5410_pin_ctrl[] __initconst = { + { + /* pin-controller instance 0 data */ + .pin_banks = exynos5410_pin_banks0, + .nr_banks = ARRAY_SIZE(exynos5410_pin_banks0), + .eint_gpio_init = exynos_eint_gpio_init, + .eint_wkup_init = exynos_eint_wkup_init, + .suspend = exynos_pinctrl_suspend, + .resume = exynos_pinctrl_resume, + }, { + /* pin-controller instance 1 data */ + .pin_banks = exynos5410_pin_banks1, + .nr_banks = ARRAY_SIZE(exynos5410_pin_banks1), + .eint_gpio_init = exynos_eint_gpio_init, + .suspend = exynos_pinctrl_suspend, + .resume = exynos_pinctrl_resume, + }, { + /* pin-controller instance 2 data */ + .pin_banks = exynos5410_pin_banks2, + .nr_banks = ARRAY_SIZE(exynos5410_pin_banks2), + .eint_gpio_init = exynos_eint_gpio_init, + .suspend = exynos_pinctrl_suspend, + .resume = exynos_pinctrl_resume, + }, { + /* pin-controller instance 3 data */ + .pin_banks = exynos5410_pin_banks3, + .nr_banks = ARRAY_SIZE(exynos5410_pin_banks3), + .eint_gpio_init = exynos_eint_gpio_init, + .suspend = exynos_pinctrl_suspend, + .resume = exynos_pinctrl_resume, + }, +}; + /* pin banks of exynos5420 pin-controller 0 */ static const struct samsung_pin_bank_data exynos5420_pin_banks0[] __initconst = { EXYNOS_PIN_BANK_EINTG(8, 0x000, "gpy7", 0x00), diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.c b/drivers/pinctrl/samsung/pinctrl-samsung.c index 3f622cc..48294e7 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.c +++ b/drivers/pinctrl/samsung/pinctrl-samsung.c @@ -1222,6 +1222,8 @@ static const struct of_device_id samsung_pinctrl_dt_match[] = { .data = (void *)exynos5250_pin_ctrl }, { .compatible = "samsung,exynos5260-pinctrl", .data = (void *)exynos5260_pin_ctrl }, + { .compatible = "samsung,exynos5410-pinctrl", + .data = (void *)exynos5410_pin_ctrl }, { .compatible = "samsung,exynos5420-pinctrl", .data = (void *)exynos5420_pin_ctrl }, { .compatible = "samsung,exynos5433-pinctrl", diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.h b/drivers/pinctrl/samsung/pinctrl-samsung.h index c1239ff..cd31bfa 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.h +++ b/drivers/pinctrl/samsung/pinctrl-samsung.h @@ -270,6 +270,7 @@ extern const struct samsung_pin_ctrl exynos4x12_pin_ctrl[]; extern const struct samsung_pin_ctrl exynos4415_pin_ctrl[]; extern const struct samsung_pin_ctrl exynos5250_pin_ctrl[]; extern const struct samsung_pin_ctrl exynos5260_pin_ctrl[]; +extern const struct samsung_pin_ctrl exynos5410_pin_ctrl[]; extern const struct samsung_pin_ctrl exynos5420_pin_ctrl[]; extern const struct samsung_pin_ctrl exynos5433_pin_ctrl[]; extern const struct samsung_pin_ctrl exynos7_pin_ctrl[]; -- cgit v0.10.2 From 823873481b2a17ce5900899f8ef85118f8407b67 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 09:22:36 -0200 Subject: [media] Revert "[media] ivtv: avoid going past input/audio array" This patch broke ivtv logic, as reported at https://bugzilla.redhat.com/show_bug.cgi?id=1278942 This reverts commit 09290cc885937cab3b2d60a6d48fe3d2d3e04061. 
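For context, a worked example of what the reverted bound did, assuming IVTV_CARD_MAX_VIDEO_INPUTS is 6 (the value implied by the smatch report quoted in the follow-up fix below):

/* reverted code: the scan was capped at MAX - 1 */
for (i = 0; i < IVTV_CARD_MAX_VIDEO_INPUTS - 1; i++)	/* slot 5 is never examined */
	if (itv->card->video_inputs[i].video_type == 0)
		break;
itv->nof_inputs = i;	/* a fully populated card reports 5 inputs, not 6 */

Restoring the full-range loop lets i legitimately reach 6 when every slot is in use; the array overflow that motivated the capped loop is instead fixed at the consumer in the next commit.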
Cc: stable@vger.kernel.org # for v4.1 and upper Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/pci/ivtv/ivtv-driver.c b/drivers/media/pci/ivtv/ivtv-driver.c index 8616fa8..c2e60b4 100644 --- a/drivers/media/pci/ivtv/ivtv-driver.c +++ b/drivers/media/pci/ivtv/ivtv-driver.c @@ -805,11 +805,11 @@ static void ivtv_init_struct2(struct ivtv *itv) { int i; - for (i = 0; i < IVTV_CARD_MAX_VIDEO_INPUTS - 1; i++) + for (i = 0; i < IVTV_CARD_MAX_VIDEO_INPUTS; i++) if (itv->card->video_inputs[i].video_type == 0) break; itv->nof_inputs = i; - for (i = 0; i < IVTV_CARD_MAX_AUDIO_INPUTS - 1; i++) + for (i = 0; i < IVTV_CARD_MAX_AUDIO_INPUTS; i++) if (itv->card->audio_inputs[i].audio_type == 0) break; itv->nof_audio_inputs = i; -- cgit v0.10.2 From d55ebd07b6c21a1c7e3e74f1b73b3b033cece2b5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 09:27:42 -0200 Subject: [media] ivtv: avoid going past input/audio array As reported by smatch: drivers/media/pci/ivtv/ivtv-driver.c:832 ivtv_init_struct2() error: buffer overflow 'itv->card->video_inputs' 6 <= 6 Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/pci/ivtv/ivtv-driver.c b/drivers/media/pci/ivtv/ivtv-driver.c index c2e60b4..2bb10cd 100644 --- a/drivers/media/pci/ivtv/ivtv-driver.c +++ b/drivers/media/pci/ivtv/ivtv-driver.c @@ -826,7 +826,7 @@ static void ivtv_init_struct2(struct ivtv *itv) IVTV_CARD_INPUT_VID_TUNER) break; } - if (i == itv->nof_inputs) + if (i >= itv->nof_inputs) i = 0; itv->active_input = i; itv->audio_input = itv->card->video_inputs[i].audio_index; -- cgit v0.10.2 From 7b6e55b9703810d771cf324d7b6cd0e1c095c86a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 4 Nov 2015 11:07:09 -0200 Subject: [media] demux.h: move documentation overview from device-drivers.tmpl It is better to keep the documentation overview at the header file, as this makes easier for developers to remember to fix when needed. Suggested-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index 42a2d85..c2bc8f7 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -243,71 +243,12 @@ X!Isound/sound_firmware.c !Idrivers/media/dvb-core/dvb_math.h !Idrivers/media/dvb-core/dvb_ringbuffer.h !Idrivers/media/dvb-core/dvbdev.h - Digital TV Demux API - The kernel demux API defines a driver-internal interface for - registering low-level, hardware specific driver to a hardware - independent demux layer. It is only of interest for Digital TV - device driver writers. The header file for this API is named - demux.h and located in - drivers/media/dvb-core. - - The demux API should be implemented for each demux in the - system. It is used to select the TS source of a demux and to manage - the demux resources. When the demux client allocates a resource via - the demux API, it receives a pointer to the API of that - resource. - Each demux receives its TS input from a DVB front-end or from - memory, as set via this demux API. In a system with more than one - front-end, the API can be used to select one of the DVB front-ends - as a TS source for a demux, unless this is fixed in the HW platform. - The demux API only controls front-ends regarding to their connections - with demuxes; the APIs used to set the other front-end parameters, - such as tuning, are not defined in this document. 
- The functions that implement the abstract interface demux should - be defined static or module private and registered to the Demux - core for external access. It is not necessary to implement every - function in the struct dmx_demux. For example, - a demux interface might support Section filtering, but not PES - filtering. The API client is expected to check the value of any - function pointer before calling the function: the value of NULL means - that the “function is not available”. - Whenever the functions of the demux API modify shared data, - the possibilities of lost update and race condition problems should - be addressed, e.g. by protecting parts of code with mutexes. - Note that functions called from a bottom half context must not - sleep. Even a simple memory allocation without using GFP_ATOMIC can - result in a kernel thread being put to sleep if swapping is needed. - For example, the Linux kernel calls the functions of a network device - interface from a bottom half context. Thus, if a demux API function - is called from network device code, the function must not sleep. - - - -
- Demux Callback API - This kernel-space API comprises the callback functions that - deliver filtered data to the demux client. Unlike the other DVB - kABIs, these functions are provided by the client and called from - the demux code. - The function pointers of this abstract interface are not - packed into a structure as in the other demux APIs, because the - callback functions are registered and used independent of each - other. As an example, it is possible for the API client to provide - several callback functions for receiving TS packets and no - callbacks for PES packets or sections. - The functions that implement the callback API need not be - re-entrant: when a demux driver calls one of these functions, - the driver is not allowed to call the function again before - the original call returns. If a callback is triggered by a - hardware interrupt, it is recommended to use the Linux - “bottom half” mechanism or start a tasklet instead of - making the callback function call directly from a hardware - interrupt. - This mechanism is implemented by - dmx_ts_cb() and - dmx_section_cb(). -
- + Digital TV Demux API +!Pdrivers/media/dvb-core/demux.h Digital TV Demux API + + Demux Callback API +!Pdrivers/media/dvb-core/demux.h Demux Callback API + !Idrivers/media/dvb-core/demux.h Remote Controller devices diff --git a/drivers/media/dvb-core/demux.h b/drivers/media/dvb-core/demux.h index ccc1f43..f8014aa 100644 --- a/drivers/media/dvb-core/demux.h +++ b/drivers/media/dvb-core/demux.h @@ -32,6 +32,49 @@ #include #include +/** + * DOC: Digital TV Demux API + * + * The kernel demux API defines a driver-internal interface for registering + * low-level, hardware specific driver to a hardware independent demux layer. + * It is only of interest for Digital TV device driver writers. + * The header file for this API is named demux.h and located in + * drivers/media/dvb-core. + * + * The demux API should be implemented for each demux in the system. It is + * used to select the TS source of a demux and to manage the demux resources. + * When the demux client allocates a resource via the demux API, it receives + * a pointer to the API of that resource. + * + * Each demux receives its TS input from a DVB front-end or from memory, as + * set via this demux API. In a system with more than one front-end, the API + * can be used to select one of the DVB front-ends as a TS source for a demux, + * unless this is fixed in the HW platform. + * + * The demux API only controls front-ends regarding to their connections with + * demuxes; the APIs used to set the other front-end parameters, such as + * tuning, are not defined in this document. + * + * The functions that implement the abstract interface demux should be defined + * static or module private and registered to the Demux core for external + * access. It is not necessary to implement every function in the struct + * &dmx_demux. For example, a demux interface might support Section filtering, + * but not PES filtering. The API client is expected to check the value of any + * function pointer before calling the function: the value of NULL means + * that the function is not available. + * + * Whenever the functions of the demux API modify shared data, the + * possibilities of lost update and race condition problems should be + * addressed, e.g. by protecting parts of code with mutexes. + * + * Note that functions called from a bottom half context must not sleep. + * Even a simple memory allocation without using %GFP_ATOMIC can result in a + * kernel thread being put to sleep if swapping is needed. For example, the + * Linux Kernel calls the functions of a network device interface from a + * bottom half context. Thus, if a demux API function is called from network + * device code, the function must not sleep. + */ + /* * Common definitions */ @@ -187,8 +230,28 @@ struct dmx_section_feed { int (*stop_filtering)(struct dmx_section_feed *feed); }; -/* - * Callback functions +/** + * DOC: Demux Callback API + * + * This kernel-space API comprises the callback functions that deliver filtered + * data to the demux client. Unlike the other DVB kABIs, these functions are + * provided by the client and called from the demux code. + * + * The function pointers of this abstract interface are not packed into a + * structure as in the other demux APIs, because the callback functions are + * registered and used independent of each other. As an example, it is possible + * for the API client to provide several callback functions for receiving TS + * packets and no callbacks for PES packets or sections. 
+ * + * The functions that implement the callback API need not be re-entrant: when + * a demux driver calls one of these functions, the driver is not allowed to + * call the function again before the original call returns. If a callback is + * triggered by a hardware interrupt, it is recommended to use the Linux + * bottom half mechanism or start a tasklet instead of making the callback + * function call directly from a hardware interrupt. + * + * This mechanism is implemented by dmx_ts_cb() and dmx_section_cb() + * callbacks. */ /** -- cgit v0.10.2 From 6747b39441925f247f15d04b89be3588f465ad57 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 4 Nov 2015 12:07:02 -0200 Subject: [media] device-drivers.tmpl: better organize DVB function calls Classify the functions at the DVB core per API. That makes easier to understand how they're related to the userspace API. Signed-off-by: Mauro Carvalho Chehab diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index c2bc8f7..fc7242d 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -238,19 +238,25 @@ X!Isound/sound_firmware.c !Iinclude/media/videobuf2-memops.h Digital TV (DVB) devices -!Idrivers/media/dvb-core/dvb_ca_en50221.h -!Idrivers/media/dvb-core/dvb_frontend.h + Digital TV Common functions !Idrivers/media/dvb-core/dvb_math.h !Idrivers/media/dvb-core/dvb_ringbuffer.h !Idrivers/media/dvb-core/dvbdev.h - Digital TV Demux API -!Pdrivers/media/dvb-core/demux.h Digital TV Demux API - - Demux Callback API -!Pdrivers/media/dvb-core/demux.h Demux Callback API - + + Digital TV Frontend kABI +!Idrivers/media/dvb-core/dvb_frontend.h + + Digital TV Demux kABI +!Pdrivers/media/dvb-core/demux.h Digital TV Demux + Demux Callback API +!Pdrivers/media/dvb-core/demux.h Demux Callback + !Idrivers/media/dvb-core/demux.h - + + Digital TV Conditional Access kABI +!Idrivers/media/dvb-core/dvb_ca_en50221.h + + Remote Controller devices !Iinclude/media/rc-core.h !Iinclude/media/lirc_dev.h diff --git a/drivers/media/dvb-core/demux.h b/drivers/media/dvb-core/demux.h index f8014aa..f716e14 100644 --- a/drivers/media/dvb-core/demux.h +++ b/drivers/media/dvb-core/demux.h @@ -33,7 +33,7 @@ #include /** - * DOC: Digital TV Demux API + * DOC: Digital TV Demux * * The kernel demux API defines a driver-internal interface for registering * low-level, hardware specific driver to a hardware independent demux layer. @@ -231,7 +231,7 @@ struct dmx_section_feed { }; /** - * DOC: Demux Callback API + * DOC: Demux Callback * * This kernel-space API comprises the callback functions that deliver filtered * data to the demux client. Unlike the other DVB kABIs, these functions are -- cgit v0.10.2 From 29bb45f25ff3051354ed330c0d0f10418a2b8c7c Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Thu, 29 Oct 2015 19:58:47 +0000 Subject: regmap-mmio: Use native endianness for read/write The regmap API has an endianness setting for formatting reads and writes. This can be set by the usual DT "little-endian" and "big-endian" properties. To work properly the associated regmap_bus needs to read/write in native endian. The "syscon" DT device binding creates an mmio-based regmap_bus which performs all reads/writes as little-endian. These values are then converted again by regmap, which means that all of the MIPS BCM boards (which are big-endian) have been declared as "little-endian" to get regmap to convert them back to big-endian. 
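A sketch of the double swap being relied upon, for a big-endian CPU reading a 32-bit register (value illustrative; readl() is defined as a little-endian accessor, so on big-endian hosts it byte-swaps relative to __raw_readl()):

u32 raw = __raw_readl(regs);	/* bytes in native order: 0xaabbccdd */
u32 le  = readl(regs);		/* LE accessor swaps:     0xddccbbaa */
/* regmap's "little-endian" value formatting swaps once more,
 * 0xddccbbaa -> 0xaabbccdd, so the two conversions cancel out --
 * which is why the big-endian DTs had to claim "little-endian".
 * With __raw_readl() there is no implicit swap, and regmap's
 * endianness setting alone determines the result. */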
Modify regmap-mmio to use the native-endian functions __raw_read*() and __raw_write*() instead of the little-endian functions read*() and write*(). Modify the big-endian MIPS BCM boards to use what will now be the correct endianness instead of pretending that the devices are little-endian. Signed-off-by: Simon Arlott Signed-off-by: Mark Brown diff --git a/arch/mips/boot/dts/brcm/bcm6328.dtsi b/arch/mips/boot/dts/brcm/bcm6328.dtsi index 41891c1..d52ce3d 100644 --- a/arch/mips/boot/dts/brcm/bcm6328.dtsi +++ b/arch/mips/boot/dts/brcm/bcm6328.dtsi @@ -73,7 +73,6 @@ timer: timer@10000040 { compatible = "syscon"; reg = <0x10000040 0x2c>; - little-endian; }; reboot { diff --git a/arch/mips/boot/dts/brcm/bcm7125.dtsi b/arch/mips/boot/dts/brcm/bcm7125.dtsi index 1a7efa8..4fc7ece 100644 --- a/arch/mips/boot/dts/brcm/bcm7125.dtsi +++ b/arch/mips/boot/dts/brcm/bcm7125.dtsi @@ -98,7 +98,6 @@ sun_top_ctrl: syscon@404000 { compatible = "brcm,bcm7125-sun-top-ctrl", "syscon"; reg = <0x404000 0x60c>; - little-endian; }; reboot { diff --git a/arch/mips/boot/dts/brcm/bcm7346.dtsi b/arch/mips/boot/dts/brcm/bcm7346.dtsi index d4bf52c..a3039bb 100644 --- a/arch/mips/boot/dts/brcm/bcm7346.dtsi +++ b/arch/mips/boot/dts/brcm/bcm7346.dtsi @@ -118,7 +118,6 @@ sun_top_ctrl: syscon@404000 { compatible = "brcm,bcm7346-sun-top-ctrl", "syscon"; reg = <0x404000 0x51c>; - little-endian; }; reboot { diff --git a/arch/mips/boot/dts/brcm/bcm7358.dtsi b/arch/mips/boot/dts/brcm/bcm7358.dtsi index 8e25016..4274ff4 100644 --- a/arch/mips/boot/dts/brcm/bcm7358.dtsi +++ b/arch/mips/boot/dts/brcm/bcm7358.dtsi @@ -112,7 +112,6 @@ sun_top_ctrl: syscon@404000 { compatible = "brcm,bcm7358-sun-top-ctrl", "syscon"; reg = <0x404000 0x51c>; - little-endian; }; reboot { diff --git a/arch/mips/boot/dts/brcm/bcm7360.dtsi b/arch/mips/boot/dts/brcm/bcm7360.dtsi index 7e5f760..0dcc9163 100644 --- a/arch/mips/boot/dts/brcm/bcm7360.dtsi +++ b/arch/mips/boot/dts/brcm/bcm7360.dtsi @@ -112,7 +112,6 @@ sun_top_ctrl: syscon@404000 { compatible = "brcm,bcm7360-sun-top-ctrl", "syscon"; reg = <0x404000 0x51c>; - little-endian; }; reboot { diff --git a/arch/mips/boot/dts/brcm/bcm7362.dtsi b/arch/mips/boot/dts/brcm/bcm7362.dtsi index c739ea7..2f3f9fc 100644 --- a/arch/mips/boot/dts/brcm/bcm7362.dtsi +++ b/arch/mips/boot/dts/brcm/bcm7362.dtsi @@ -118,7 +118,6 @@ sun_top_ctrl: syscon@404000 { compatible = "brcm,bcm7362-sun-top-ctrl", "syscon"; reg = <0x404000 0x51c>; - little-endian; }; reboot { diff --git a/arch/mips/boot/dts/brcm/bcm7420.dtsi b/arch/mips/boot/dts/brcm/bcm7420.dtsi index 5f55d0a..bee221b 100644 --- a/arch/mips/boot/dts/brcm/bcm7420.dtsi +++ b/arch/mips/boot/dts/brcm/bcm7420.dtsi @@ -99,7 +99,6 @@ sun_top_ctrl: syscon@404000 { compatible = "brcm,bcm7420-sun-top-ctrl", "syscon"; reg = <0x404000 0x60c>; - little-endian; }; reboot { diff --git a/arch/mips/boot/dts/brcm/bcm7425.dtsi b/arch/mips/boot/dts/brcm/bcm7425.dtsi index e24d41a..571f30f 100644 --- a/arch/mips/boot/dts/brcm/bcm7425.dtsi +++ b/arch/mips/boot/dts/brcm/bcm7425.dtsi @@ -100,7 +100,6 @@ sun_top_ctrl: syscon@404000 { compatible = "brcm,bcm7425-sun-top-ctrl", "syscon"; reg = <0x404000 0x51c>; - little-endian; }; reboot { diff --git a/arch/mips/boot/dts/brcm/bcm7435.dtsi b/arch/mips/boot/dts/brcm/bcm7435.dtsi index 8b9432c..614ee21 100644 --- a/arch/mips/boot/dts/brcm/bcm7435.dtsi +++ b/arch/mips/boot/dts/brcm/bcm7435.dtsi @@ -114,7 +114,6 @@ sun_top_ctrl: syscon@404000 { compatible = "brcm,bcm7425-sun-top-ctrl", "syscon"; reg = <0x404000 0x51c>; - little-endian; }; reboot { diff 
--git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c index 426a57e..8a77876 100644 --- a/drivers/base/regmap/regmap-mmio.c +++ b/drivers/base/regmap/regmap-mmio.c @@ -106,17 +106,17 @@ static int regmap_mmio_gather_write(void *context, while (val_size) { switch (ctx->val_bytes) { case 1: - writeb(*(u8 *)val, ctx->regs + offset); + __raw_writeb(*(u8 *)val, ctx->regs + offset); break; case 2: - writew(*(u16 *)val, ctx->regs + offset); + __raw_writew(*(u16 *)val, ctx->regs + offset); break; case 4: - writel(*(u32 *)val, ctx->regs + offset); + __raw_writel(*(u32 *)val, ctx->regs + offset); break; #ifdef CONFIG_64BIT case 8: - writeq(*(u64 *)val, ctx->regs + offset); + __raw_writeq(*(u64 *)val, ctx->regs + offset); break; #endif default: @@ -166,17 +166,17 @@ static int regmap_mmio_read(void *context, while (val_size) { switch (ctx->val_bytes) { case 1: - *(u8 *)val = readb(ctx->regs + offset); + *(u8 *)val = __raw_readb(ctx->regs + offset); break; case 2: - *(u16 *)val = readw(ctx->regs + offset); + *(u16 *)val = __raw_readw(ctx->regs + offset); break; case 4: - *(u32 *)val = readl(ctx->regs + offset); + *(u32 *)val = __raw_readl(ctx->regs + offset); break; #ifdef CONFIG_64BIT case 8: - *(u64 *)val = readq(ctx->regs + offset); + *(u64 *)val = __raw_readq(ctx->regs + offset); break; #endif default: -- cgit v0.10.2 From 6399aea629b02a23364efcb6eea1319b8e9d1abf Mon Sep 17 00:00:00 2001 From: Nikesh Oswal Date: Wed, 21 Oct 2015 14:16:14 +0100 Subject: regmap: rbtree: When adding a reg do a bsearch for target node A binary search is much more efficient than iterating over the rbtree in ascending order, which is what the current code does. During initialisation the reg defaults are written to the cache in one large chunk and are always sorted in ascending order, so for that case it would have been ideal to iterate the rbtree in descending order. But at runtime the drivers may write into the cache in any random order, so this patch opts for a bsearch to give optimal runtime performance; at initialisation time, when the reg defaults are written, a binary search also performs much better than the ascending iteration the current code was doing. Signed-off-by: Nikesh Oswal Signed-off-by: Mark Brown diff --git a/drivers/base/regmap/regcache-rbtree.c b/drivers/base/regmap/regcache-rbtree.c index 56486d9..353f602 100644 --- a/drivers/base/regmap/regcache-rbtree.c +++ b/drivers/base/regmap/regcache-rbtree.c @@ -413,8 +413,8 @@ static int regcache_rbtree_write(struct regmap *map, unsigned int reg, max = reg + max_dist; /* look for an adjacent register to the one we are about to add */ - for (node = rb_first(&rbtree_ctx->root); node; - node = rb_next(node)) { + node = rbtree_ctx->root.rb_node; + while (node) { rbnode_tmp = rb_entry(node, struct regcache_rbtree_node, node); @@ -425,6 +425,11 @@ static int regcache_rbtree_write(struct regmap *map, unsigned int reg, new_base_reg = min(reg, base_reg); new_top_reg = max(reg, top_reg); } else { + if (max < base_reg) + node = node->rb_left; + else + node = node->rb_right; + continue; } -- cgit v0.10.2 From 9e8925b67a809bb27ce4b7d352d67f25cf1d7fc5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 16 Nov 2015 09:49:34 -0500 Subject: locks: Allow disabling mandatory locking at compile time Mandatory locking appears to be almost unused and buggy, and there appears to be no real interest in doing anything with it.
Since effectively no one uses the code and since the code is buggy, let's allow it to be disabled at compile time. I would just suggest removing the code, but undoubtedly that will break some piece of userspace code somewhere. For the distributions that don't care about this piece of code, this gives a nice starting point to make mandatory locking go away. Cc: Benjamin Coddington Cc: Dmitry Vyukov Cc: Jeff Layton Cc: J. Bruce Fields Signed-off-by: "Eric W. Biederman" Signed-off-by: Jeff Layton diff --git a/fs/Kconfig b/fs/Kconfig index da3f32f..59322e6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -67,6 +67,16 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. +config MANDATORY_FILE_LOCKING + bool "Enable Mandatory file locking" + depends on FILE_LOCKING + default y + help + This option allows appropriately marked files on appropriately + mounted filesystems to support mandatory locking. + + To the best of my knowledge this is dead code that no one cares about. + source "fs/notify/Kconfig" source "fs/quota/Kconfig" diff --git a/fs/locks.c b/fs/locks.c index 0d2b326..86c9467 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1191,6 +1191,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) return error; } +#ifdef CONFIG_MANDATORY_FILE_LOCKING /** * locks_mandatory_locked - Check for an active lock * @file: the file to check @@ -1289,6 +1290,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, } EXPORT_SYMBOL(locks_mandatory_area); +#endif /* CONFIG_MANDATORY_FILE_LOCKING */ static void lease_clear_pending(struct file_lock *fl, int arg) { diff --git a/fs/namespace.c b/fs/namespace.c index 0570729..4219885 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1584,6 +1584,14 @@ static inline bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } +static inline bool may_mandlock(void) +{ +#ifndef CONFIG_MANDATORY_FILE_LOCKING + return false; +#endif + return true; +} + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices.
@@ -2677,6 +2685,8 @@ long do_mount(const char *dev_name, const char __user *dir_name, type_page, flags, data_page); if (!retval && !may_mount()) retval = -EPERM; + if (!retval && (flags & MS_MANDLOCK) && !may_mandlock()) + retval = -EPERM; if (retval) goto dput_out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3aa5142..cbf08d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2030,7 +2030,7 @@ extern struct kobject *fs_kobj; #define FLOCK_VERIFY_READ 1 #define FLOCK_VERIFY_WRITE 2 -#ifdef CONFIG_FILE_LOCKING +#ifdef CONFIG_MANDATORY_FILE_LOCKING extern int locks_mandatory_locked(struct file *); extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); @@ -2075,6 +2075,45 @@ static inline int locks_verify_truncate(struct inode *inode, return 0; } +#else /* !CONFIG_MANDATORY_FILE_LOCKING */ + +static inline int locks_mandatory_locked(struct file *file) +{ + return 0; +} + +static inline int locks_mandatory_area(int rw, struct inode *inode, + struct file *filp, loff_t offset, + size_t count) +{ + return 0; +} + +static inline int __mandatory_lock(struct inode *inode) +{ + return 0; +} + +static inline int mandatory_lock(struct inode *inode) +{ + return 0; +} + +static inline int locks_verify_locked(struct file *file) +{ + return 0; +} + +static inline int locks_verify_truncate(struct inode *inode, struct file *filp, + size_t size) +{ + return 0; +} + +#endif /* CONFIG_MANDATORY_FILE_LOCKING */ + + +#ifdef CONFIG_FILE_LOCKING static inline int break_lease(struct inode *inode, unsigned int mode) { /* @@ -2136,39 +2175,6 @@ static inline int break_layout(struct inode *inode, bool wait) } #else /* !CONFIG_FILE_LOCKING */ -static inline int locks_mandatory_locked(struct file *file) -{ - return 0; -} - -static inline int locks_mandatory_area(int rw, struct inode *inode, - struct file *filp, loff_t offset, - size_t count) -{ - return 0; -} - -static inline int __mandatory_lock(struct inode *inode) -{ - return 0; -} - -static inline int mandatory_lock(struct inode *inode) -{ - return 0; -} - -static inline int locks_verify_locked(struct file *file) -{ - return 0; -} - -static inline int locks_verify_truncate(struct inode *inode, struct file *filp, - size_t size) -{ - return 0; -} - static inline int break_lease(struct inode *inode, unsigned int mode) { return 0; -- cgit v0.10.2 From 95ace75414f312f9a7b93d873f386987b92a5301 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 11 Nov 2015 17:22:33 -0600 Subject: locks: Don't allow mounts in user namespaces to enable mandatory locking Since no one uses mandatory locking and files with mandatory locks can cause problems don't allow them in user namespaces. Signed-off-by: "Eric W. 
Biederman" Signed-off-by: Jeff Layton diff --git a/fs/namespace.c b/fs/namespace.c index 4219885..4d2c8f64 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1589,7 +1589,7 @@ static inline bool may_mandlock(void) #ifndef CONFIG_MANDATORY_FILE_LOCKING return false; #endif - return true; + return capable(CAP_SYS_ADMIN); } /* -- cgit v0.10.2 From 4d4142696e18cf30af319031d47bba46853a4605 Mon Sep 17 00:00:00 2001 From: Guillaume Gomez Date: Wed, 23 Sep 2015 12:34:30 +0200 Subject: percpu: Remove unneeded return from void function Signed-off-by: Guillaume Gomez Acked-by: Christoph Lameter Signed-off-by: Tejun Heo diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 12c9b48..84f542d 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -116,7 +116,7 @@ void percpu_ref_reinit(struct percpu_ref *ref); */ static inline void percpu_ref_kill(struct percpu_ref *ref) { - return percpu_ref_kill_and_confirm(ref, NULL); + percpu_ref_kill_and_confirm(ref, NULL); } /* -- cgit v0.10.2 From b916b785af99088916a122cb37de1bda3fa7f70e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 28 Oct 2015 12:32:17 +0000 Subject: drivers/perf: kill armpmu_register Nothing outside of drivers/perf/arm_pmu.c should call armpmu_register any more, so it no longer needs to be in include/linux/perf/arm_pmu.h. Additionally, by folding it in to arm_pmu_device_probe we can allow drivers to override struct pmu fields without getting blatted by the armpmu code. This patch folds armpmu_register into arm_pmu_device_probe. The logging to the console is moved to after the PMU is successfully registered with the core perf code. Signed-off-by: Mark Rutland Suggested-by: Will Deacon Cc: Drew Richardson Cc: Pawel Moll Signed-off-by: Will Deacon diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index be3755c..166637f 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -551,14 +551,6 @@ static void armpmu_init(struct arm_pmu *armpmu) }; } -int armpmu_register(struct arm_pmu *armpmu, int type) -{ - armpmu_init(armpmu); - pr_info("enabled with %s PMU driver, %d counters available\n", - armpmu->name, armpmu->num_events); - return perf_pmu_register(&armpmu->pmu, armpmu->name, type); -} - /* Set at runtime when we know what CPU type we are. 
*/ static struct arm_pmu *__oprofile_cpu_pmu; @@ -887,6 +879,8 @@ int arm_pmu_device_probe(struct platform_device *pdev, return -ENOMEM; } + armpmu_init(pmu); + if (!__oprofile_cpu_pmu) __oprofile_cpu_pmu = pmu; @@ -912,10 +906,13 @@ int arm_pmu_device_probe(struct platform_device *pdev, if (ret) goto out_free; - ret = armpmu_register(pmu, -1); + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); if (ret) goto out_destroy; + pr_info("enabled with %s PMU driver, %d counters available\n", + pmu->name, pmu->num_events); + return 0; out_destroy: diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index bfa673b..83b5e34 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -111,8 +111,6 @@ struct arm_pmu { #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) -int armpmu_register(struct arm_pmu *armpmu, int type); - u64 armpmu_event_update(struct perf_event *event); int armpmu_event_set_period(struct perf_event *event); -- cgit v0.10.2 From f4ab36cb103a55d02ef83727880a14d9be9823f0 Mon Sep 17 00:00:00 2001 From: Drew Richardson Date: Wed, 28 Oct 2015 08:19:56 -0700 Subject: arm: perf: Convert event enums to #defines The enums are not necessary and this allows the event values to be used to construct static strings at compile time. Signed-off-by: Drew Richardson Signed-off-by: Will Deacon diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 126dc67..dc97972 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -35,133 +35,117 @@ * but the encodings are considered to be `reserved' in the case that * they are not available. */ -enum armv7_perf_types { - ARMV7_PERFCTR_PMNC_SW_INCR = 0x00, - ARMV7_PERFCTR_L1_ICACHE_REFILL = 0x01, - ARMV7_PERFCTR_ITLB_REFILL = 0x02, - ARMV7_PERFCTR_L1_DCACHE_REFILL = 0x03, - ARMV7_PERFCTR_L1_DCACHE_ACCESS = 0x04, - ARMV7_PERFCTR_DTLB_REFILL = 0x05, - ARMV7_PERFCTR_MEM_READ = 0x06, - ARMV7_PERFCTR_MEM_WRITE = 0x07, - ARMV7_PERFCTR_INSTR_EXECUTED = 0x08, - ARMV7_PERFCTR_EXC_TAKEN = 0x09, - ARMV7_PERFCTR_EXC_EXECUTED = 0x0A, - ARMV7_PERFCTR_CID_WRITE = 0x0B, +#define ARMV7_PERFCTR_PMNC_SW_INCR 0x00 +#define ARMV7_PERFCTR_L1_ICACHE_REFILL 0x01 +#define ARMV7_PERFCTR_ITLB_REFILL 0x02 +#define ARMV7_PERFCTR_L1_DCACHE_REFILL 0x03 +#define ARMV7_PERFCTR_L1_DCACHE_ACCESS 0x04 +#define ARMV7_PERFCTR_DTLB_REFILL 0x05 +#define ARMV7_PERFCTR_MEM_READ 0x06 +#define ARMV7_PERFCTR_MEM_WRITE 0x07 +#define ARMV7_PERFCTR_INSTR_EXECUTED 0x08 +#define ARMV7_PERFCTR_EXC_TAKEN 0x09 +#define ARMV7_PERFCTR_EXC_EXECUTED 0x0A +#define ARMV7_PERFCTR_CID_WRITE 0x0B - /* - * ARMV7_PERFCTR_PC_WRITE is equivalent to HW_BRANCH_INSTRUCTIONS. - * It counts: - * - all (taken) branch instructions, - * - instructions that explicitly write the PC, - * - exception generating instructions. - */ - ARMV7_PERFCTR_PC_WRITE = 0x0C, - ARMV7_PERFCTR_PC_IMM_BRANCH = 0x0D, - ARMV7_PERFCTR_PC_PROC_RETURN = 0x0E, - ARMV7_PERFCTR_MEM_UNALIGNED_ACCESS = 0x0F, - ARMV7_PERFCTR_PC_BRANCH_MIS_PRED = 0x10, - ARMV7_PERFCTR_CLOCK_CYCLES = 0x11, - ARMV7_PERFCTR_PC_BRANCH_PRED = 0x12, - - /* These events are defined by the PMUv2 supplement (ARM DDI 0457A). 
*/ - ARMV7_PERFCTR_MEM_ACCESS = 0x13, - ARMV7_PERFCTR_L1_ICACHE_ACCESS = 0x14, - ARMV7_PERFCTR_L1_DCACHE_WB = 0x15, - ARMV7_PERFCTR_L2_CACHE_ACCESS = 0x16, - ARMV7_PERFCTR_L2_CACHE_REFILL = 0x17, - ARMV7_PERFCTR_L2_CACHE_WB = 0x18, - ARMV7_PERFCTR_BUS_ACCESS = 0x19, - ARMV7_PERFCTR_MEM_ERROR = 0x1A, - ARMV7_PERFCTR_INSTR_SPEC = 0x1B, - ARMV7_PERFCTR_TTBR_WRITE = 0x1C, - ARMV7_PERFCTR_BUS_CYCLES = 0x1D, - - ARMV7_PERFCTR_CPU_CYCLES = 0xFF -}; +/* + * ARMV7_PERFCTR_PC_WRITE is equivalent to HW_BRANCH_INSTRUCTIONS. + * It counts: + * - all (taken) branch instructions, + * - instructions that explicitly write the PC, + * - exception generating instructions. + */ +#define ARMV7_PERFCTR_PC_WRITE 0x0C +#define ARMV7_PERFCTR_PC_IMM_BRANCH 0x0D +#define ARMV7_PERFCTR_PC_PROC_RETURN 0x0E +#define ARMV7_PERFCTR_MEM_UNALIGNED_ACCESS 0x0F +#define ARMV7_PERFCTR_PC_BRANCH_MIS_PRED 0x10 +#define ARMV7_PERFCTR_CLOCK_CYCLES 0x11 +#define ARMV7_PERFCTR_PC_BRANCH_PRED 0x12 + +/* These events are defined by the PMUv2 supplement (ARM DDI 0457A). */ +#define ARMV7_PERFCTR_MEM_ACCESS 0x13 +#define ARMV7_PERFCTR_L1_ICACHE_ACCESS 0x14 +#define ARMV7_PERFCTR_L1_DCACHE_WB 0x15 +#define ARMV7_PERFCTR_L2_CACHE_ACCESS 0x16 +#define ARMV7_PERFCTR_L2_CACHE_REFILL 0x17 +#define ARMV7_PERFCTR_L2_CACHE_WB 0x18 +#define ARMV7_PERFCTR_BUS_ACCESS 0x19 +#define ARMV7_PERFCTR_MEM_ERROR 0x1A +#define ARMV7_PERFCTR_INSTR_SPEC 0x1B +#define ARMV7_PERFCTR_TTBR_WRITE 0x1C +#define ARMV7_PERFCTR_BUS_CYCLES 0x1D + +#define ARMV7_PERFCTR_CPU_CYCLES 0xFF /* ARMv7 Cortex-A8 specific event types */ -enum armv7_a8_perf_types { - ARMV7_A8_PERFCTR_L2_CACHE_ACCESS = 0x43, - ARMV7_A8_PERFCTR_L2_CACHE_REFILL = 0x44, - ARMV7_A8_PERFCTR_L1_ICACHE_ACCESS = 0x50, - ARMV7_A8_PERFCTR_STALL_ISIDE = 0x56, -}; +#define ARMV7_A8_PERFCTR_L2_CACHE_ACCESS 0x43 +#define ARMV7_A8_PERFCTR_L2_CACHE_REFILL 0x44 +#define ARMV7_A8_PERFCTR_L1_ICACHE_ACCESS 0x50 +#define ARMV7_A8_PERFCTR_STALL_ISIDE 0x56 /* ARMv7 Cortex-A9 specific event types */ -enum armv7_a9_perf_types { - ARMV7_A9_PERFCTR_INSTR_CORE_RENAME = 0x68, - ARMV7_A9_PERFCTR_STALL_ICACHE = 0x60, - ARMV7_A9_PERFCTR_STALL_DISPATCH = 0x66, -}; +#define ARMV7_A9_PERFCTR_INSTR_CORE_RENAME 0x68 +#define ARMV7_A9_PERFCTR_STALL_ICACHE 0x60 +#define ARMV7_A9_PERFCTR_STALL_DISPATCH 0x66 /* ARMv7 Cortex-A5 specific event types */ -enum armv7_a5_perf_types { - ARMV7_A5_PERFCTR_PREFETCH_LINEFILL = 0xc2, - ARMV7_A5_PERFCTR_PREFETCH_LINEFILL_DROP = 0xc3, -}; +#define ARMV7_A5_PERFCTR_PREFETCH_LINEFILL 0xc2 +#define ARMV7_A5_PERFCTR_PREFETCH_LINEFILL_DROP 0xc3 /* ARMv7 Cortex-A15 specific event types */ -enum armv7_a15_perf_types { - ARMV7_A15_PERFCTR_L1_DCACHE_ACCESS_READ = 0x40, - ARMV7_A15_PERFCTR_L1_DCACHE_ACCESS_WRITE = 0x41, - ARMV7_A15_PERFCTR_L1_DCACHE_REFILL_READ = 0x42, - ARMV7_A15_PERFCTR_L1_DCACHE_REFILL_WRITE = 0x43, +#define ARMV7_A15_PERFCTR_L1_DCACHE_ACCESS_READ 0x40 +#define ARMV7_A15_PERFCTR_L1_DCACHE_ACCESS_WRITE 0x41 +#define ARMV7_A15_PERFCTR_L1_DCACHE_REFILL_READ 0x42 +#define ARMV7_A15_PERFCTR_L1_DCACHE_REFILL_WRITE 0x43 - ARMV7_A15_PERFCTR_DTLB_REFILL_L1_READ = 0x4C, - ARMV7_A15_PERFCTR_DTLB_REFILL_L1_WRITE = 0x4D, +#define ARMV7_A15_PERFCTR_DTLB_REFILL_L1_READ 0x4C +#define ARMV7_A15_PERFCTR_DTLB_REFILL_L1_WRITE 0x4D - ARMV7_A15_PERFCTR_L2_CACHE_ACCESS_READ = 0x50, - ARMV7_A15_PERFCTR_L2_CACHE_ACCESS_WRITE = 0x51, - ARMV7_A15_PERFCTR_L2_CACHE_REFILL_READ = 0x52, - ARMV7_A15_PERFCTR_L2_CACHE_REFILL_WRITE = 0x53, +#define ARMV7_A15_PERFCTR_L2_CACHE_ACCESS_READ 0x50 +#define 
ARMV7_A15_PERFCTR_L2_CACHE_ACCESS_WRITE 0x51 +#define ARMV7_A15_PERFCTR_L2_CACHE_REFILL_READ 0x52 +#define ARMV7_A15_PERFCTR_L2_CACHE_REFILL_WRITE 0x53 - ARMV7_A15_PERFCTR_PC_WRITE_SPEC = 0x76, -}; +#define ARMV7_A15_PERFCTR_PC_WRITE_SPEC 0x76 /* ARMv7 Cortex-A12 specific event types */ -enum armv7_a12_perf_types { - ARMV7_A12_PERFCTR_L1_DCACHE_ACCESS_READ = 0x40, - ARMV7_A12_PERFCTR_L1_DCACHE_ACCESS_WRITE = 0x41, +#define ARMV7_A12_PERFCTR_L1_DCACHE_ACCESS_READ 0x40 +#define ARMV7_A12_PERFCTR_L1_DCACHE_ACCESS_WRITE 0x41 - ARMV7_A12_PERFCTR_L2_CACHE_ACCESS_READ = 0x50, - ARMV7_A12_PERFCTR_L2_CACHE_ACCESS_WRITE = 0x51, +#define ARMV7_A12_PERFCTR_L2_CACHE_ACCESS_READ 0x50 +#define ARMV7_A12_PERFCTR_L2_CACHE_ACCESS_WRITE 0x51 - ARMV7_A12_PERFCTR_PC_WRITE_SPEC = 0x76, +#define ARMV7_A12_PERFCTR_PC_WRITE_SPEC 0x76 - ARMV7_A12_PERFCTR_PF_TLB_REFILL = 0xe7, -}; +#define ARMV7_A12_PERFCTR_PF_TLB_REFILL 0xe7 /* ARMv7 Krait specific event types */ -enum krait_perf_types { - KRAIT_PMRESR0_GROUP0 = 0xcc, - KRAIT_PMRESR1_GROUP0 = 0xd0, - KRAIT_PMRESR2_GROUP0 = 0xd4, - KRAIT_VPMRESR0_GROUP0 = 0xd8, +#define KRAIT_PMRESR0_GROUP0 0xcc +#define KRAIT_PMRESR1_GROUP0 0xd0 +#define KRAIT_PMRESR2_GROUP0 0xd4 +#define KRAIT_VPMRESR0_GROUP0 0xd8 - KRAIT_PERFCTR_L1_ICACHE_ACCESS = 0x10011, - KRAIT_PERFCTR_L1_ICACHE_MISS = 0x10010, +#define KRAIT_PERFCTR_L1_ICACHE_ACCESS 0x10011 +#define KRAIT_PERFCTR_L1_ICACHE_MISS 0x10010 - KRAIT_PERFCTR_L1_ITLB_ACCESS = 0x12222, - KRAIT_PERFCTR_L1_DTLB_ACCESS = 0x12210, -}; +#define KRAIT_PERFCTR_L1_ITLB_ACCESS 0x12222 +#define KRAIT_PERFCTR_L1_DTLB_ACCESS 0x12210 /* ARMv7 Scorpion specific event types */ -enum scorpion_perf_types { - SCORPION_LPM0_GROUP0 = 0x4c, - SCORPION_LPM1_GROUP0 = 0x50, - SCORPION_LPM2_GROUP0 = 0x54, - SCORPION_L2LPM_GROUP0 = 0x58, - SCORPION_VLPM_GROUP0 = 0x5c, +#define SCORPION_LPM0_GROUP0 0x4c +#define SCORPION_LPM1_GROUP0 0x50 +#define SCORPION_LPM2_GROUP0 0x54 +#define SCORPION_L2LPM_GROUP0 0x58 +#define SCORPION_VLPM_GROUP0 0x5c - SCORPION_ICACHE_ACCESS = 0x10053, - SCORPION_ICACHE_MISS = 0x10052, +#define SCORPION_ICACHE_ACCESS 0x10053 +#define SCORPION_ICACHE_MISS 0x10052 - SCORPION_DTLB_ACCESS = 0x12013, - SCORPION_DTLB_MISS = 0x12012, +#define SCORPION_DTLB_ACCESS 0x12013 +#define SCORPION_DTLB_MISS 0x12012 - SCORPION_ITLB_MISS = 0x12021, -}; +#define SCORPION_ITLB_MISS 0x12021 /* * Cortex-A8 HW events mapping -- cgit v0.10.2 From 3fbac6ccb6c3a8958239d9026c4d41db60c2f1cf Mon Sep 17 00:00:00 2001 From: Drew Richardson Date: Wed, 28 Oct 2015 08:20:41 -0700 Subject: arm: perf: Add event descriptions Add additional information about the ARM architected hardware events to make counters self describing. This makes the hardware PMUs easier to use as perf list contains possible events instead of users having to refer to documentation like the ARM TRMs. 
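The enum-to-#define conversion in the previous commit is what makes these strings possible: a macro argument is expanded to its literal value before stringification, so, as a sketch of the expansion (the PMU_EVENT_ATTR_STRING internals live elsewhere and are elided here):

ARMV7_EVENT_ATTR(inst_retired, ARMV7_PERFCTR_INSTR_EXECUTED);
/* config expands to 0x08 before the # operator applies, producing a
 * sysfs attribute whose contents are the string "event=0x08"; had the
 * value still been an enumerator, the result would have been the
 * unusable string "event=ARMV7_PERFCTR_INSTR_EXECUTED". */

Userspace then sees the event by name, e.g. (path assumed from the usual perf sysfs layout) /sys/bus/event_source/devices/armv7_cortex_a9/events/inst_retired, which is how perf list can enumerate it.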
Signed-off-by: Drew Richardson Signed-off-by: Will Deacon diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index dc97972..970e136 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -531,6 +531,120 @@ static const unsigned scorpion_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV7_PERFCTR_PC_BRANCH_MIS_PRED, }; +#define ARMV7_EVENT_ATTR_RESOLVE(m) #m +#define ARMV7_EVENT_ATTR(name, config) \ + PMU_EVENT_ATTR_STRING(name, armv7_event_attr_##name, \ + "event=" ARMV7_EVENT_ATTR_RESOLVE(config)) + +ARMV7_EVENT_ATTR(sw_incr, ARMV7_PERFCTR_PMNC_SW_INCR); +ARMV7_EVENT_ATTR(l1i_cache_refill, ARMV7_PERFCTR_L1_ICACHE_REFILL); +ARMV7_EVENT_ATTR(l1i_tlb_refill, ARMV7_PERFCTR_ITLB_REFILL); +ARMV7_EVENT_ATTR(l1d_cache_refill, ARMV7_PERFCTR_L1_DCACHE_REFILL); +ARMV7_EVENT_ATTR(l1d_cache, ARMV7_PERFCTR_L1_DCACHE_ACCESS); +ARMV7_EVENT_ATTR(l1d_tlb_refill, ARMV7_PERFCTR_DTLB_REFILL); +ARMV7_EVENT_ATTR(ld_retired, ARMV7_PERFCTR_MEM_READ); +ARMV7_EVENT_ATTR(st_retired, ARMV7_PERFCTR_MEM_WRITE); +ARMV7_EVENT_ATTR(inst_retired, ARMV7_PERFCTR_INSTR_EXECUTED); +ARMV7_EVENT_ATTR(exc_taken, ARMV7_PERFCTR_EXC_TAKEN); +ARMV7_EVENT_ATTR(exc_return, ARMV7_PERFCTR_EXC_EXECUTED); +ARMV7_EVENT_ATTR(cid_write_retired, ARMV7_PERFCTR_CID_WRITE); +ARMV7_EVENT_ATTR(pc_write_retired, ARMV7_PERFCTR_PC_WRITE); +ARMV7_EVENT_ATTR(br_immed_retired, ARMV7_PERFCTR_PC_IMM_BRANCH); +ARMV7_EVENT_ATTR(br_return_retired, ARMV7_PERFCTR_PC_PROC_RETURN); +ARMV7_EVENT_ATTR(unaligned_ldst_retired, ARMV7_PERFCTR_MEM_UNALIGNED_ACCESS); +ARMV7_EVENT_ATTR(br_mis_pred, ARMV7_PERFCTR_PC_BRANCH_MIS_PRED); +ARMV7_EVENT_ATTR(cpu_cycles, ARMV7_PERFCTR_CLOCK_CYCLES); +ARMV7_EVENT_ATTR(br_pred, ARMV7_PERFCTR_PC_BRANCH_PRED); + +static struct attribute *armv7_pmuv1_event_attrs[] = { + &armv7_event_attr_sw_incr.attr.attr, + &armv7_event_attr_l1i_cache_refill.attr.attr, + &armv7_event_attr_l1i_tlb_refill.attr.attr, + &armv7_event_attr_l1d_cache_refill.attr.attr, + &armv7_event_attr_l1d_cache.attr.attr, + &armv7_event_attr_l1d_tlb_refill.attr.attr, + &armv7_event_attr_ld_retired.attr.attr, + &armv7_event_attr_st_retired.attr.attr, + &armv7_event_attr_inst_retired.attr.attr, + &armv7_event_attr_exc_taken.attr.attr, + &armv7_event_attr_exc_return.attr.attr, + &armv7_event_attr_cid_write_retired.attr.attr, + &armv7_event_attr_pc_write_retired.attr.attr, + &armv7_event_attr_br_immed_retired.attr.attr, + &armv7_event_attr_br_return_retired.attr.attr, + &armv7_event_attr_unaligned_ldst_retired.attr.attr, + &armv7_event_attr_br_mis_pred.attr.attr, + &armv7_event_attr_cpu_cycles.attr.attr, + &armv7_event_attr_br_pred.attr.attr, + NULL +}; + +static struct attribute_group armv7_pmuv1_events_attr_group = { + .name = "events", + .attrs = armv7_pmuv1_event_attrs, +}; + +static const struct attribute_group *armv7_pmuv1_attr_groups[] = { + &armv7_pmuv1_events_attr_group, + NULL +}; + +ARMV7_EVENT_ATTR(mem_access, ARMV7_PERFCTR_MEM_ACCESS); +ARMV7_EVENT_ATTR(l1i_cache, ARMV7_PERFCTR_L1_ICACHE_ACCESS); +ARMV7_EVENT_ATTR(l1d_cache_wb, ARMV7_PERFCTR_L1_DCACHE_WB); +ARMV7_EVENT_ATTR(l2d_cache, ARMV7_PERFCTR_L2_CACHE_ACCESS); +ARMV7_EVENT_ATTR(l2d_cache_refill, ARMV7_PERFCTR_L2_CACHE_REFILL); +ARMV7_EVENT_ATTR(l2d_cache_wb, ARMV7_PERFCTR_L2_CACHE_WB); +ARMV7_EVENT_ATTR(bus_access, ARMV7_PERFCTR_BUS_ACCESS); +ARMV7_EVENT_ATTR(memory_error, ARMV7_PERFCTR_MEM_ERROR); +ARMV7_EVENT_ATTR(inst_spec, ARMV7_PERFCTR_INSTR_SPEC); +ARMV7_EVENT_ATTR(ttbr_write_retired, 
ARMV7_PERFCTR_TTBR_WRITE); +ARMV7_EVENT_ATTR(bus_cycles, ARMV7_PERFCTR_BUS_CYCLES); + +static struct attribute *armv7_pmuv2_event_attrs[] = { + &armv7_event_attr_sw_incr.attr.attr, + &armv7_event_attr_l1i_cache_refill.attr.attr, + &armv7_event_attr_l1i_tlb_refill.attr.attr, + &armv7_event_attr_l1d_cache_refill.attr.attr, + &armv7_event_attr_l1d_cache.attr.attr, + &armv7_event_attr_l1d_tlb_refill.attr.attr, + &armv7_event_attr_ld_retired.attr.attr, + &armv7_event_attr_st_retired.attr.attr, + &armv7_event_attr_inst_retired.attr.attr, + &armv7_event_attr_exc_taken.attr.attr, + &armv7_event_attr_exc_return.attr.attr, + &armv7_event_attr_cid_write_retired.attr.attr, + &armv7_event_attr_pc_write_retired.attr.attr, + &armv7_event_attr_br_immed_retired.attr.attr, + &armv7_event_attr_br_return_retired.attr.attr, + &armv7_event_attr_unaligned_ldst_retired.attr.attr, + &armv7_event_attr_br_mis_pred.attr.attr, + &armv7_event_attr_cpu_cycles.attr.attr, + &armv7_event_attr_br_pred.attr.attr, + &armv7_event_attr_mem_access.attr.attr, + &armv7_event_attr_l1i_cache.attr.attr, + &armv7_event_attr_l1d_cache_wb.attr.attr, + &armv7_event_attr_l2d_cache.attr.attr, + &armv7_event_attr_l2d_cache_refill.attr.attr, + &armv7_event_attr_l2d_cache_wb.attr.attr, + &armv7_event_attr_bus_access.attr.attr, + &armv7_event_attr_memory_error.attr.attr, + &armv7_event_attr_inst_spec.attr.attr, + &armv7_event_attr_ttbr_write_retired.attr.attr, + &armv7_event_attr_bus_cycles.attr.attr, + NULL +}; + +static struct attribute_group armv7_pmuv2_events_attr_group = { + .name = "events", + .attrs = armv7_pmuv2_event_attrs, +}; + +static const struct attribute_group *armv7_pmuv2_attr_groups[] = { + &armv7_pmuv2_events_attr_group, + NULL +}; + /* * Perf Events' indices */ @@ -1069,6 +1183,7 @@ static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu) armv7pmu_init(cpu_pmu); cpu_pmu->name = "armv7_cortex_a8"; cpu_pmu->map_event = armv7_a8_map_event; + cpu_pmu->pmu.attr_groups = armv7_pmuv1_attr_groups; return armv7_probe_num_events(cpu_pmu); } @@ -1077,6 +1192,7 @@ static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu) armv7pmu_init(cpu_pmu); cpu_pmu->name = "armv7_cortex_a9"; cpu_pmu->map_event = armv7_a9_map_event; + cpu_pmu->pmu.attr_groups = armv7_pmuv1_attr_groups; return armv7_probe_num_events(cpu_pmu); } @@ -1085,6 +1201,7 @@ static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu) armv7pmu_init(cpu_pmu); cpu_pmu->name = "armv7_cortex_a5"; cpu_pmu->map_event = armv7_a5_map_event; + cpu_pmu->pmu.attr_groups = armv7_pmuv1_attr_groups; return armv7_probe_num_events(cpu_pmu); } @@ -1094,6 +1211,7 @@ static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu) cpu_pmu->name = "armv7_cortex_a15"; cpu_pmu->map_event = armv7_a15_map_event; cpu_pmu->set_event_filter = armv7pmu_set_event_filter; + cpu_pmu->pmu.attr_groups = armv7_pmuv2_attr_groups; return armv7_probe_num_events(cpu_pmu); } @@ -1103,6 +1221,7 @@ static int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu) cpu_pmu->name = "armv7_cortex_a7"; cpu_pmu->map_event = armv7_a7_map_event; cpu_pmu->set_event_filter = armv7pmu_set_event_filter; + cpu_pmu->pmu.attr_groups = armv7_pmuv2_attr_groups; return armv7_probe_num_events(cpu_pmu); } @@ -1112,6 +1231,7 @@ static int armv7_a12_pmu_init(struct arm_pmu *cpu_pmu) cpu_pmu->name = "armv7_cortex_a12"; cpu_pmu->map_event = armv7_a12_map_event; cpu_pmu->set_event_filter = armv7pmu_set_event_filter; + cpu_pmu->pmu.attr_groups = armv7_pmuv2_attr_groups; return armv7_probe_num_events(cpu_pmu); } @@ -1119,6 +1239,7 @@ static int armv7_a17_pmu_init(struct arm_pmu 
*cpu_pmu) { int ret = armv7_a12_pmu_init(cpu_pmu); cpu_pmu->name = "armv7_cortex_a17"; + cpu_pmu->pmu.attr_groups = armv7_pmuv2_attr_groups; return ret; } -- cgit v0.10.2 From 18fc93fd64129c96432812cb44f59c963871889b Mon Sep 17 00:00:00 2001 From: Jungseok Lee Date: Wed, 4 Nov 2015 13:26:07 +0000 Subject: percpu: remove PERCPU_ENOUGH_ROOM which is stale definition As pure cleanup, this patch removes PERCPU_ENOUGH_ROOM which is not used any more. That is, no code refers to the definition. Acked-by: Christoph Lameter Signed-off-by: Jungseok Lee Signed-off-by: Tejun Heo diff --git a/arch/ia64/include/asm/percpu.h b/arch/ia64/include/asm/percpu.h index 0ec484d..b929579 100644 --- a/arch/ia64/include/asm/percpu.h +++ b/arch/ia64/include/asm/percpu.h @@ -6,8 +6,6 @@ * David Mosberger-Tang */ -#define PERCPU_ENOUGH_ROOM PERCPU_PAGE_SIZE - #ifdef __ASSEMBLY__ # define THIS_CPU(var) (var) /* use this to mark accesses to per-CPU variables... */ #else /* !__ASSEMBLY__ */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index caebf2a..4bc6daf 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -18,12 +18,6 @@ #define PERCPU_MODULE_RESERVE 0 #endif -#ifndef PERCPU_ENOUGH_ROOM -#define PERCPU_ENOUGH_ROOM \ - (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ - PERCPU_MODULE_RESERVE) -#endif - /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) -- cgit v0.10.2 From 67e9c74b8a873408c27ac9a8e4c1d1c8d72c93ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 16 Nov 2015 11:13:34 -0500 Subject: cgroup: replace __DEVEL__sane_behavior with cgroup2 fs type With major controllers - cpu, memory and io - shaping up for the unified hierarchy, cgroup2 is about ready to be, gradually, released into the wild. Replace __DEVEL__sane_behavior flag which was used to select the unified hierarchy with a separate filesystem type "cgroup2" so that unified hierarchy can be mounted as follows. mount -t cgroup2 none $MOUNT_POINT The cgroup2 fs has its own magic number - 0x63677270 ("cgrp"). v2: Assign a different magic number to cgroup2 fs. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 781b1d4..c1f0e87 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -94,11 +94,9 @@ the process. 2-1. Mounting -Currently, unified hierarchy can be mounted with the following mount -command. Note that this is still under development and scheduled to -change soon. +Unified hierarchy can be mounted with the following mount command. 
- mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT + mount -t cgroup2 none $MOUNT_POINT All controllers which support the unified hierarchy and are not bound to other hierarchies are automatically bound to unified hierarchy and diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 869fd4a..80e2ae6 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -66,7 +66,6 @@ enum { /* cgroup_root->flags */ enum { - CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ }; diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index accb036..b283d56 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -54,6 +54,7 @@ #define SMB_SUPER_MAGIC 0x517B #define CGROUP_SUPER_MAGIC 0x27e0eb +#define CGROUP2_SUPER_MAGIC 0x63677270 #define STACK_END_MAGIC 0x57AC6E9D diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b316deb..af08862 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly; /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; @@ -1641,10 +1642,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } - if (!strcmp(token, "__DEVEL__sane_behavior")) { - opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; - continue; - } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1711,15 +1708,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (nr_opts != 1) { - pr_err("sane_behavior: no other mount options allowed\n"); - return -EINVAL; - } - return 0; - } - /* * If the 'all' option was specified select all the subsystems, * otherwise if 'none', 'name=' and a subsystem name options were @@ -1998,6 +1986,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root; @@ -2014,6 +2003,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ @@ -2021,15 +2021,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* look for a matching existing root */ - if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { - cgrp_dfl_root_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - ret = 0; - goto out_unlock; - } - /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. 
Let's drain the @@ -2140,9 +2131,10 @@ out_free: if (ret) return ERR_PTR(ret); - +out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2185,6 +2177,12 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, +}; + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -5315,6 +5313,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); return 0; -- cgit v0.10.2 From 0d942766453f3d23a51e0a2d430340a178b0903e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 16 Nov 2015 11:13:34 -0500 Subject: cgroup: rename Documentation/cgroups/ to Documentation/cgroup-legacy/ In preparation for adding cgroup2 documentation, rename Documentation/cgroups/ to Documentation/cgroup-legacy/. Signed-off-by: Tejun Heo Acked-by: Li Zefan diff --git a/Documentation/cgroup-legacy/00-INDEX b/Documentation/cgroup-legacy/00-INDEX new file mode 100644 index 0000000..3f5a40f --- /dev/null +++ b/Documentation/cgroup-legacy/00-INDEX @@ -0,0 +1,30 @@ +00-INDEX + - this file +blkio-controller.txt + - Description for Block IO Controller, implementation and usage details. +cgroups.txt + - Control Groups definition, implementation details, examples and API. +cpuacct.txt + - CPU Accounting Controller; account CPU usage for groups of tasks. +cpusets.txt + - documents the cpusets feature; assign CPUs and Mem to a set of tasks. +devices.txt + - Device Whitelist Controller; description, interface and security. +freezer-subsystem.txt + - checkpointing; rationale to not use signals, interface. +hugetlb.txt + - HugeTLB Controller implementation and usage details. +memcg_test.txt + - Memory Resource Controller; implementation details. +memory.txt + - Memory Resource Controller; design, accounting, interface, testing. +net_cls.txt + - Network classifier cgroups details and usages. +net_prio.txt + - Network priority cgroups details and usages. +pids.txt + - Process number cgroups details and usages. +resource_counter.txt + - Resource Counter API. +unified-hierarchy.txt + - Description the new/next cgroup interface. diff --git a/Documentation/cgroup-legacy/blkio-controller.txt b/Documentation/cgroup-legacy/blkio-controller.txt new file mode 100644 index 0000000..52fa9f3 --- /dev/null +++ b/Documentation/cgroup-legacy/blkio-controller.txt @@ -0,0 +1,455 @@ + Block IO Controller + =================== +Overview +======== +cgroup subsys "blkio" implements the block io controller. There seems to be +a need of various kinds of IO control policies (like proportional BW, max BW) +both at leaf nodes as well as at intermediate nodes in a storage hierarchy. +Plan is to use the same cgroup based management interface for blkio controller +and based on user options switch IO policies in the background. + +Currently two IO control policies are implemented. First one is proportional +weight time based division of disk policy. It is implemented in CFQ. Hence +this policy takes effect only on leaf nodes when CFQ is being used. 
The second
+one is a throttling policy, which can be used to specify an upper IO rate
+limit on devices. This policy is implemented in the generic block layer and
+can be used on leaf nodes as well as on higher-level logical devices like
+device mapper.
+
+HOWTO
+=====
+Proportional Weight division of bandwidth
+-----------------------------------------
+You can do a very simple test by running two dd threads in two different
+cgroups. Here is what you can do.
+
+- Enable the Block IO controller
+	CONFIG_BLK_CGROUP=y
+
+- Enable group scheduling in CFQ
+	CONFIG_CFQ_GROUP_IOSCHED=y
+
+- Compile and boot into the kernel and mount the IO controller (blkio); see
+  cgroups.txt, Why are cgroups needed?.
+
+	mount -t tmpfs cgroup_root /sys/fs/cgroup
+	mkdir /sys/fs/cgroup/blkio
+	mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
+
+- Create two cgroups
+	mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2
+
+- Set the weights of groups test1 and test2
+	echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight
+	echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight
+
+- Create two files of the same size (say 512MB each) on the same disk
+  (file1, file2) and launch two dd threads in different cgroups to read
+  those files.
+
+	sync
+	echo 3 > /proc/sys/vm/drop_caches
+
+	dd if=/mnt/sdb/zerofile1 of=/dev/null &
+	echo $! > /sys/fs/cgroup/blkio/test1/tasks
+	cat /sys/fs/cgroup/blkio/test1/tasks
+
+	dd if=/mnt/sdb/zerofile2 of=/dev/null &
+	echo $! > /sys/fs/cgroup/blkio/test2/tasks
+	cat /sys/fs/cgroup/blkio/test2/tasks
+
+- At the macro level, the first dd should finish first. To get more precise
+  data, keep looking (with the help of a script) at the blkio.disk_time and
+  blkio.disk_sectors files of both the test1 and test2 groups. This will tell
+  how much disk time (in milliseconds) each group got and how many sectors
+  each group dispatched to the disk. We provide fairness in terms of disk
+  time, so ideally blkio.disk_time of the cgroups should be in proportion to
+  the weight.
+
+Throttling/Upper Limit policy
+-----------------------------
+- Enable the Block IO controller
+	CONFIG_BLK_CGROUP=y
+
+- Enable throttling in the block layer
+	CONFIG_BLK_DEV_THROTTLING=y
+
+- Mount the blkio controller (see cgroups.txt, Why are cgroups needed?)
+	mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
+
+- Specify a bandwidth rate on a particular device for the root group. The
+  format for the policy is "<major>:<minor> <bytes_per_second>".
+
+	echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
+
+  The above will put a limit of 1MB/second on reads happening for the root
+  group on the device having major/minor number 8:16.
+
+- Run dd to read a file and see whether the rate is throttled to 1MB/s.
+
+	# dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 iflag=direct
+	1024+0 records in
+	1024+0 records out
+	4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
+
+  Limits for writes can be set using the blkio.throttle.write_bps_device file.
+
+Hierarchical Cgroups
+====================
+
+Both CFQ and throttling implement hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from the cgroup side, which currently is a development option and
+not publicly available.
+
+If somebody creates a hierarchy like the following,
+
+			root
+			/  \
+		     test1 test2
+			|
+		     test3
+
+CFQ by default and throttling with "sane_behavior" will handle the
+hierarchy correctly. For details on CFQ hierarchy support, refer to
+Documentation/block/cfq-iosched.txt.
For throttling, all limits apply
+to the whole subtree, while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from the cgroup side will
+practically treat all groups at the same level, as if the hierarchy
+looked like the following.
+
+			pivot
+		     /  /   \  \
+		root  test1 test2  test3
+
+Various user visible config options
+===================================
+CONFIG_BLK_CGROUP
+	- Block IO controller.
+
+CONFIG_DEBUG_BLK_CGROUP
+	- Debug help. Right now some additional stats files show up in the
+	  cgroup if this option is enabled.
+
+CONFIG_CFQ_GROUP_IOSCHED
+	- Enables group scheduling in CFQ. Currently only 1 level of group
+	  creation is allowed.
+
+CONFIG_BLK_DEV_THROTTLING
+	- Enable block device throttling support in the block layer.
+
+Details of cgroup files
+=======================
+Proportional weight policy files
+--------------------------------
+- blkio.weight
+	- Specifies the per-cgroup weight. This is the default weight of the
+	  group on all devices until and unless overridden by a per-device
+	  rule (see blkio.weight_device).
+	  The currently allowed range of weights is from 10 to 1000.
+
+- blkio.weight_device
+	- One can specify per-cgroup, per-device rules using this interface.
+	  These rules override the default value of the group weight as
+	  specified by blkio.weight.
+
+	  Following is the format.
+
+	  # echo dev_maj:dev_minor weight > blkio.weight_device
+	  Configure weight=300 on /dev/sdb (8:16) in this cgroup
+	  # echo 8:16 300 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev     weight
+	  8:16    300
+
+	  Configure weight=500 on /dev/sda (8:0) in this cgroup
+	  # echo 8:0 500 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev     weight
+	  8:0     500
+	  8:16    300
+
+	  Remove the specific weight for /dev/sda in this cgroup
+	  # echo 8:0 0 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev     weight
+	  8:16    300
+
+- blkio.leaf_weight[_device]
+	- Equivalents of blkio.weight[_device] for the purpose of
+	  deciding how much weight tasks in the given cgroup have while
+	  competing with the cgroup's child cgroups. For details,
+	  please refer to Documentation/block/cfq-iosched.txt.
+
+- blkio.time
+	- Disk time allocated to the cgroup per device, in milliseconds. The
+	  first two fields specify the major and minor number of the device
+	  and the third field specifies the disk time allocated to the group
+	  in milliseconds.
+
+- blkio.sectors
+	- Number of sectors transferred to/from disk by the group. The first
+	  two fields specify the major and minor number of the device and
+	  the third field specifies the number of sectors transferred by the
+	  group to/from the device.
+
+- blkio.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. The first two fields specify the major and minor number
+	  of the device, the third field specifies the operation type and the
+	  fourth field specifies the number of bytes.
+
+- blkio.io_serviced
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. The first two fields specify the major and minor number
+	  of the device, the third field specifies the operation type and the
+	  fourth field specifies the number of IOs.
+
+- blkio.io_service_time
+	- Total amount of time between request dispatch and request completion
+	  for the IOs done by this cgroup. This is in nanoseconds to make it
+	  meaningful for flash devices too.
For devices with queue depth of 1, + this time represents the actual service time. When queue_depth > 1, + that is no longer true as requests may be served out of order. This + may cause the service time for a given IO to include the service time + of multiple IOs when served out of order which may result in total + io_service_time > actual time elapsed. This time is further divided by + the type of operation - read or write, sync or async. First two fields + specify the major and minor number of the device, third field + specifies the operation type and the fourth field specifies the + io_service_time in ns. + +- blkio.io_wait_time + - Total amount of time the IOs for this cgroup spent waiting in the + scheduler queues for service. This can be greater than the total time + elapsed since it is cumulative io_wait_time for all IOs. It is not a + measure of total time the cgroup spent waiting but rather a measure of + the wait_time for its individual IOs. For devices with queue_depth > 1 + this metric does not include the time spent waiting for service once + the IO is dispatched to the device but till it actually gets serviced + (there might be a time lag here due to re-ordering of requests by the + device). This is in nanoseconds to make it meaningful for flash + devices too. This time is further divided by the type of operation - + read or write, sync or async. First two fields specify the major and + minor number of the device, third field specifies the operation type + and the fourth field specifies the io_wait_time in ns. + +- blkio.io_merged + - Total number of bios/requests merged into requests belonging to this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.io_queued + - Total number of requests queued up at any given instant for this + cgroup. This is further divided by the type of operation - read or + write, sync or async. + +- blkio.avg_queue_size + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + The average queue size for this cgroup over the entire time of this + cgroup's existence. Queue size samples are taken each time one of the + queues of this cgroup gets a timeslice. + +- blkio.group_wait_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time the cgroup had to wait since it became busy + (i.e., went from 0 to 1 request queued) to get a timeslice for one of + its queues. This is different from the io_wait_time which is the + cumulative total of the amount of time spent by each IO in that cgroup + waiting in the scheduler queue. This is in nanoseconds. If this is + read when the cgroup is in a waiting (for timeslice) state, the stat + will only report the group_wait_time accumulated till the last time it + got a timeslice and will not include the current delta. + +- blkio.empty_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. + This is the amount of time a cgroup spends without any pending + requests when not being served, i.e., it does not include any time + spent idling for one of the queues of the cgroup. This is in + nanoseconds. If this is read when the cgroup is in an empty state, + the stat will only report the empty_time accumulated till the last + time it had a pending request and will not include the current delta. + +- blkio.idle_time + - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. 
+	  This is the amount of time spent by the IO scheduler idling for a
+	  given cgroup in anticipation of a better request than the existing
+	  ones from other queues/cgroups. This is in nanoseconds. If this is
+	  read when the cgroup is in an idling state, the stat will only
+	  report the idle_time accumulated till the last idle period and will
+	  not include the current delta.
+
+- blkio.dequeue
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
+	  gives the statistics about how many times a group was dequeued
+	  from the service tree of the device. The first two fields specify
+	  the major and minor number of the device and the third field
+	  specifies the number of times a group was dequeued from a
+	  particular device.
+
+- blkio.*_recursive
+	- Recursive version of various stats. These files show the
+	  same information as their non-recursive counterparts but
+	  include stats from all the descendant cgroups.
+
+Throttling/Upper limit policy files
+-----------------------------------
+- blkio.throttle.read_bps_device
+	- Specifies the upper limit on READ rate from the device. IO rate is
+	  specified in bytes per second. Rules are per device. Following is
+	  the format.
+
+  echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
+
+- blkio.throttle.write_bps_device
+	- Specifies the upper limit on WRITE rate to the device. IO rate is
+	  specified in bytes per second. Rules are per device. Following is
+	  the format.
+
+  echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
+
+- blkio.throttle.read_iops_device
+	- Specifies the upper limit on READ rate from the device. IO rate is
+	  specified in IOs per second. Rules are per device. Following is
+	  the format.
+
+  echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
+
+- blkio.throttle.write_iops_device
+	- Specifies the upper limit on WRITE rate to the device. IO rate is
+	  specified in IOs per second. Rules are per device. Following is
+	  the format.
+
+  echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
+
+Note: If both BW and IOPS rules are specified for a device, then IO is
+      subjected to both constraints.
+
+- blkio.throttle.io_serviced
+	- Number of IOs (bio) issued to the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. The first two fields specify the major and minor number
+	  of the device, the third field specifies the operation type and the
+	  fourth field specifies the number of IOs.
+
+- blkio.throttle.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. The first two fields specify the major and minor number
+	  of the device, the third field specifies the operation type and the
+	  fourth field specifies the number of bytes.
+
+Common files among various policies
+-----------------------------------
+- blkio.reset_stats
+	- Writing an int to this file will result in resetting all the stats
+	  for that cgroup.
+
+CFQ sysfs tunable
+=================
+/sys/block/<dev>/queue/iosched/slice_idle
+------------------------------------------
+On faster hardware CFQ can be slow, especially with sequential workloads.
+This happens because CFQ idles on a single queue, and a single queue might
+not drive deep enough request queue depths to keep the storage busy. In such
+scenarios one can try setting slice_idle=0; that switches CFQ to IOPS
+(IO operations per second) mode on NCQ-supporting hardware, as sketched
+below.
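+
+A minimal sketch of flipping the tunable (the device name sdb and the value
+read back are illustrative):
+
+	# cat /sys/block/sdb/queue/iosched/slice_idle
+	8
+	# echo 0 > /sys/block/sdb/queue/iosched/slice_idle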
+
+That means CFQ will not idle between cfq queues of a cfq group and hence will
+be able to drive higher queue depths and achieve better throughput. That also
+means that cfq provides fairness among groups in terms of IOPS and not in
+terms of disk time.
+
+/sys/block/<dev>/queue/iosched/group_idle
+------------------------------------------
+If one disables idling on individual cfq queues and cfq service trees by
+setting slice_idle=0, group_idle kicks in. That means CFQ will still idle
+on the group in an attempt to provide fairness among groups.
+
+By default group_idle is the same as slice_idle and does not do anything if
+slice_idle is enabled.
+
+One can experience an overall throughput drop if multiple groups have been
+created and the applications placed in those groups are not driving enough
+IO to keep the disk busy. In that case set group_idle=0, and CFQ will not
+idle on individual groups and throughput should improve.
+
+Writeback
+=========
+
+Page cache is dirtied through buffered writes and shared mmaps and
+written asynchronously to the backing filesystem by the writeback
+mechanism. Writeback sits between the memory and IO domains and
+regulates the proportion of dirty memory by balancing dirtying and
+write IOs.
+
+On traditional cgroup hierarchies, relationships between different
+controllers cannot be established, making it impossible for writeback
+to operate accounting for cgroup resource restrictions, and all
+writeback IOs are attributed to the root cgroup.
+
+If both the blkio and memory controllers are used on the v2 hierarchy
+and the filesystem supports cgroup writeback, writeback operations
+correctly follow the resource restrictions imposed by both the memory
+and blkio controllers.
+
+Writeback examines both system-wide and per-cgroup dirty memory status
+and enforces the more restrictive of the two. Also, writeback control
+parameters which are absolute values - vm.dirty_bytes and
+vm.dirty_background_bytes - are distributed across cgroups according
+to their current writeback bandwidth.
+
+There's a peculiarity stemming from the discrepancy in ownership
+granularity between the memory controller and writeback. While the
+memory controller tracks ownership per page, writeback operates on an
+inode basis. cgroup writeback bridges the gap by tracking ownership by
+inode but migrating ownership if too many foreign pages, pages which
+don't match the current inode ownership, have been encountered while
+writing back the inode.
+
+This is a conscious design choice, as writeback operations are
+inherently tied to inodes, making strictly following page ownership
+complicated and inefficient. The only use case which suffers from
+this compromise is multiple cgroups concurrently dirtying disjoint
+regions of the same inode, which is an unlikely use case and decided
+to be unsupported. Note that as the memory controller assigns page
+ownership on first use and doesn't update it until the page is
+released, even if cgroup writeback strictly followed page ownership,
+multiple cgroups dirtying overlapping areas wouldn't work as expected.
+In general, write-sharing an inode across multiple cgroups is not well
+supported.
+
+Filesystem support for cgroup writeback
+---------------------------------------
+
+A filesystem can make writeback IOs cgroup-aware by updating
+address_space_operations->writepage[s]() to annotate bios using the
+following two functions.
+
+* wbc_init_bio(@wbc, @bio)
+
+  Should be called for each bio carrying writeback data and associates
+  the bio with the inode's owner cgroup.
Can be called anytime + between bio allocation and submission. + +* wbc_account_io(@wbc, @page, @bytes) + + Should be called for each data segment being written out. While + this function doesn't care exactly when it's called during the + writeback session, it's the easiest and most natural to call it as + data segments are added to a bio. + +With writeback bio's annotated, cgroup support can be enabled per +super_block by setting MS_CGROUPWB in ->s_flags. This allows for +selective disabling of cgroup writeback support which is helpful when +certain filesystem features, e.g. journaled data mode, are +incompatible. + +wbc_init_bio() binds the specified bio to its cgroup. Depending on +the configuration, the bio may be executed at a lower priority and if +the writeback session is holding shared resources, e.g. a journal +entry, may lead to priority inversion. There is no one easy solution +for the problem. Filesystems can try to work around specific problem +cases by skipping wbc_init_bio() or using bio_associate_blkcg() +directly. diff --git a/Documentation/cgroup-legacy/cgroups.txt b/Documentation/cgroup-legacy/cgroups.txt new file mode 100644 index 0000000..c6256ae --- /dev/null +++ b/Documentation/cgroup-legacy/cgroups.txt @@ -0,0 +1,682 @@ + CGROUPS + ------- + +Written by Paul Menage based on +Documentation/cgroups/cpusets.txt + +Original copyright statements from cpusets.txt: +Portions Copyright (C) 2004 BULL SA. +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +Modified by Paul Jackson +Modified by Christoph Lameter + +CONTENTS: +========= + +1. Control Groups + 1.1 What are cgroups ? + 1.2 Why are cgroups needed ? + 1.3 How are cgroups implemented ? + 1.4 What does notify_on_release do ? + 1.5 What does clone_children do ? + 1.6 How do I use cgroups ? +2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Attaching processes + 2.3 Mounting hierarchies by name +3. Kernel API + 3.1 Overview + 3.2 Synchronization + 3.3 Subsystem API +4. Extended attributes usage +5. Questions + +1. Control Groups +================= + +1.1 What are cgroups ? +---------------------- + +Control Groups provide a mechanism for aggregating/partitioning sets of +tasks, and all their future children, into hierarchical groups with +specialized behaviour. + +Definitions: + +A *cgroup* associates a set of tasks with a set of parameters for one +or more subsystems. + +A *subsystem* is a module that makes use of the task grouping +facilities provided by cgroups to treat groups of tasks in +particular ways. A subsystem is typically a "resource controller" that +schedules a resource or applies per-cgroup limits, but it may be +anything that wants to act on a group of processes, e.g. a +virtualization subsystem. + +A *hierarchy* is a set of cgroups arranged in a tree, such that +every task in the system is in exactly one of the cgroups in the +hierarchy, and a set of subsystems; each subsystem has system-specific +state attached to each cgroup in the hierarchy. Each hierarchy has +an instance of the cgroup virtual filesystem associated with it. + +At any one time there may be multiple active hierarchies of task +cgroups. Each hierarchy is a partition of all tasks in the system. + +User-level code may create and destroy cgroups by name in an +instance of the cgroup virtual file system, specify and query to +which cgroup a task is assigned, and list the task PIDs assigned to +a cgroup. Those creations and assignments only affect the hierarchy +associated with that instance of the cgroup file system. 
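+
+For example, assuming a cpuset hierarchy is already mounted at
+/sys/fs/cgroup/cpuset (the cgroup name "newjob" is illustrative), a cgroup
+can be created, a task assigned to it, and the assignment queried like this:
+
+	# mkdir /sys/fs/cgroup/cpuset/newjob
+	# echo $$ > /sys/fs/cgroup/cpuset/newjob/tasks
+	# cat /proc/self/cgroup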
+
+On their own, the only use for cgroups is simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allow
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource-tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+up in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines:
+
+       CPU :          "Top cpuset"
+                       /       \
+               CPUSet1         CPUSet2
+                  |               |
+               (Professors)    (Students)
+
+               In addition (system tasks) are attached to topcpuset (so
+               that they can run anywhere) with a limit of 20%
+
+       Memory : Professors (50%), Students (30%), system (20%)
+
+       Disk : Professors (50%), Students (30%), system (20%)
+
+       Network : WWW browsing (20%), Network File System (60%), others (20%)
+                       / \
+       Professors (15%)  students (5%)
+
+Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
+into the NFS network class.
+
+At the same time Firefox/Lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies),
+the admin can easily set up a script which receives exec notifications
+and, depending on who is launching the browser, can do
+
+    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
+
+With only a single hierarchy, the admin would potentially have to create
+a separate cgroup for every browser launched and associate it with
+appropriate network and other resource classes. This may lead to a
+proliferation of such cgroups.
+
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :)) OR give one of the student's simulation
+apps enhanced CPU power.
+
+With the ability to write PIDs directly to resource classes, it's just a
+matter of:
+
+	# echo pid > /sys/fs/cgroup/network/<new_class>/tasks
+	(after some time)
+	# echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
+
+Without this ability, the administrator would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extend the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+   css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+   cgroup_subsys_state objects, one for each cgroup subsystem
+   registered in the system. There is no direct link from a task to
+   the cgroup of which it's a member in each hierarchy, but this
+   can be determined by following pointers through the
+   cgroup_subsys_state objects. This is because accessing the
+   subsystem state is something that's expected to happen frequently
+   and in performance-critical code, whereas operations that require a
+   task's actual cgroup assignments (in particular, moving between
+   cgroups) are less common. A linked list runs through the cg_list
+   field of each task_struct using the css_set, anchored at
+   css_set->tasks.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+   manipulation from user space.
+
+ - You can list all the tasks (by PID) attached to any cgroup.
+
+The implementation of cgroups requires a few simple hooks
+into the rest of the kernel, none in performance-critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+   css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition, a new file system of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel. When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options. By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+
+ - tasks: list of tasks (by PID) attached to that cgroup. This list
+   is not guaranteed to be sorted.
Writing a thread ID into this file + moves the thread into this cgroup. + - cgroup.procs: list of thread group IDs in the cgroup. This list is + not guaranteed to be sorted or free of duplicate TGIDs, and userspace + should sort/uniquify the list if this property is required. + Writing a thread group ID into this file moves all threads in that + group into this cgroup. + - notify_on_release flag: run the release agent on exit? + - release_agent: the path to use for release notifications (this file + exists in the top cgroup only) + +Other subsystems such as cpusets may add additional files in each +cgroup dir. + +New cgroups are created using the mkdir system call or shell +command. The properties of a cgroup, such as its flags, are +modified by writing to the appropriate file in that cgroups +directory, as listed above. + +The named hierarchical structure of nested cgroups allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cgroup allows organizing the work load +on a system into related sets of tasks. A task may be re-attached to +any other cgroup, if allowed by the permissions on the necessary +cgroup file system directories. + +When a task is moved from one cgroup to another, it gets a new +css_set pointer - if there's an already existing css_set with the +desired collection of cgroups then that group is reused, otherwise a new +css_set is allocated. The appropriate existing css_set is located by +looking into a hash table. + +To allow access from a cgroup to the css_sets (and hence tasks) +that comprise it, a set of cg_cgroup_link objects form a lattice; +each cg_cgroup_link is linked into a list of cg_cgroup_links for +a single cgroup on its cgrp_link_list field, and a list of +cg_cgroup_links for a single css_set on its cg_link_list. + +Thus the set of tasks in a cgroup can be listed by iterating over +each css_set that references the cgroup, and sub-iterating over +each css_set's task set. + +The use of a Linux virtual file system (vfs) to represent the +cgroup hierarchy provides for a familiar permission and name space +for cgroups, with a minimum of additional kernel code. + +1.4 What does notify_on_release do ? +------------------------------------ + +If the notify_on_release flag is enabled (1) in a cgroup, then +whenever the last task in the cgroup leaves (exits or attaches to +some other cgroup) and the last child cgroup of that cgroup +is removed, then the kernel runs the command specified by the contents +of the "release_agent" file in that hierarchy's root directory, +supplying the pathname (relative to the mount point of the cgroup +file system) of the abandoned cgroup. This enables automatic +removal of abandoned cgroups. The default value of +notify_on_release in the root cgroup at system boot is disabled +(0). The default value of other cgroups at creation is the current +value of their parents' notify_on_release settings. The default value of +a cgroup hierarchy's release_agent path is empty. + +1.5 What does clone_children do ? +--------------------------------- + +This flag only affects the cpuset controller. If the clone_children +flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its +configuration from the parent during initialization. + +1.6 How do I use cgroups ? 
+--------------------------
+
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like:
+
+ 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
+ 2) mkdir /sys/fs/cgroup/cpuset
+ 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 5) Start a task that will be the "founding father" of the new job.
+ 6) Attach that task to the new cgroup by writing its PID to the
+    /sys/fs/cgroup/cpuset tasks file for that cgroup.
+ 7) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will set up a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup:
+
+  mount -t tmpfs cgroup_root /sys/fs/cgroup
+  mkdir /sys/fs/cgroup/cpuset
+  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cgroup Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cgroup
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, and using cgroups can be done through the cgroup
+virtual filesystem.
+
+To mount a cgroup hierarchy with all available subsystems, type:
+# mount -t cgroup xxx /sys/fs/cgroup
+
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts, so it may be any useful identifying string that you like.
+
+Note: Some subsystems do not work without some user input first. For instance,
+if cpusets are enabled the user will have to populate the cpus and mems files
+for each new cgroup created before that group can be used.
+
+As explained in section `1.2 Why are cgroups needed?' you should create
+different hierarchies of cgroups for each single resource or group of
+resources you want to control. Therefore, you should mount a tmpfs on
+/sys/fs/cgroup and create directories for each cgroup resource or resource
+group.
+
+# mount -t tmpfs cgroup_root /sys/fs/cgroup
+# mkdir /sys/fs/cgroup/rg1
+
+To mount a cgroup hierarchy with just the cpuset and memory
+subsystems, type:
+# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
+
+While remounting cgroups is currently supported, it is not recommended
+to use it. Remounting allows changing bound subsystems and
+release_agent. Rebinding is hardly useful as it only works when the
+hierarchy is empty, and release_agent itself should be replaced with
+conventional fsnotify. The support for remounting will be removed in
+the future.
+
+To specify a hierarchy's release_agent:
+# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
+  xxx /sys/fs/cgroup/rg1
+
+Note that specifying 'release_agent' more than once will return failure.
+
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+
+Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
+is the cgroup that holds the whole system.
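+
+A quick way to confirm what was mounted where is to look at /proc/mounts
+(the output line below is illustrative for the hier1 mount above):
+
+# grep rg1 /proc/mounts
+hier1 /sys/fs/cgroup/rg1 cgroup rw,relatime,cpuset,memory 0 0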
+
+If you want to change the value of release_agent:
+# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
+
+It can also be changed via remount.
+
+If you want to create a new cgroup under /sys/fs/cgroup/rg1:
+# cd /sys/fs/cgroup/rg1
+# mkdir my_cgroup
+
+Now you want to do something with this cgroup.
+# cd my_cgroup
+
+In this directory you can find several files:
+# ls
+cgroup.procs notify_on_release tasks
+(plus whatever files are added by the attached subsystems)
+
+Now attach your shell to this cgroup:
+# /bin/echo $$ > tasks
+
+You can also create cgroups inside your cgroup by using mkdir in this
+directory.
+# mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir:
+# rmdir my_sub_cs
+
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by another subsystem-specific
+reference).
+
+2.2 Attaching processes
+-----------------------
+
+# /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another:
+
+# /bin/echo PID1 > tasks
+# /bin/echo PID2 > tasks
+	...
+# /bin/echo PIDn > tasks
+
+You can attach the current shell task by echoing 0:
+
+# echo 0 > tasks
+
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the PID of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
+Note: Since every task is always a member of exactly one cgroup in each
+mounted hierarchy, to remove a task from its current cgroup you must
+move it into a new cgroup (possibly the root cgroup) by writing to the
+new cgroup's tasks file.
+
+Note: Due to some restrictions enforced by some cgroup subsystems, moving
+a process to another cgroup can fail.
+
+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name= option when mounting a cgroups hierarchy
+associates the given name with the hierarchy. This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems. Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match [\w.-]+
+
+When passing a name= option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a subsystem a name.
+
+The name of the subsystem appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroups.
+
+
+3. Kernel API
+=============
+
+3.1 Overview
+------------
+
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem ID which will be assigned by the cgroup system.
+
+Other fields in the cgroup_subsys object include:
+
+- subsys_id: a unique array index for the subsystem, indicating which
+  entry in cgroup->subsys[] this subsystem should be managing.
+
+- name: should be initialized to a unique subsystem name. Should be
+  no longer than MAX_CGROUP_TYPE_NAMELEN.
+
+- early_init: indicate if the subsystem needs early initialization
+  at system boot.
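+
+The registered subsystems, the hierarchy each one is attached to, and
+whether each is enabled can be inspected from userspace (the numbers shown
+are illustrative):
+
+# cat /proc/cgroups
+#subsys_name	hierarchy	num_cgroups	enabled
+cpuset	1	4	1
+memory	2	12	1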
+
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem ID; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+
+3.2 Synchronization
+-------------------
+
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+
+See kernel/cgroup.c for more details.
+
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+
+3.3 Subsystem API
+-----------------
+
+Each subsystem should:
+
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_subsys
+
+If a subsystem can be compiled as a module, it should also have in its
+module initcall a call to cgroup_load_subsys(), and in its exitcall a
+call to cgroup_unload_subsys(). It should also set <name>_subsys.module =
+THIS_MODULE in its .c file.
+
+Each subsystem may export the following methods. The only mandatory
+methods are css_alloc/free. Any others that are null are presumed to
+be successful no-ops.
+
+struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
+(cgroup_mutex held by caller)
+
+Called to allocate a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or an
+ERR_PTR() value. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+
+int css_online(struct cgroup *cgrp)
+(cgroup_mutex held by caller)
+
+Called after @cgrp has successfully completed all allocations and is
+made visible to cgroup_for_each_child/descendant_*() iterators. The
+subsystem may choose to fail creation by returning -errno. This
+callback can be used to implement reliable state sharing and
+propagation along the hierarchy. See the comment on
+cgroup_for_each_descendant_pre() for details.
+
+void css_offline(struct cgroup *cgrp);
+(cgroup_mutex held by caller)
+
+This is the counterpart of css_online() and is called iff css_online()
+has succeeded on @cgrp. This signifies the beginning of the end of
+@cgrp. @cgrp is being removed and the subsystem should start dropping
+all references it's holding on @cgrp. When all references are dropped,
+cgroup removal will proceed to the next step - css_free(). After this
+callback, @cgrp should be considered dead to the subsystem.
+
+void css_free(struct cgroup *cgrp)
+(cgroup_mutex held by caller)
+
+The cgroup system is about to free @cgrp; the subsystem should free
+its subsystem state object. By the time this method is called, @cgrp
+is completely unused; @cgrp->parent is still valid.
(Note - can also
+be called for a newly-created cgroup if an error occurs after this
+subsystem's create() method has been called for the new cgroup).
+
+int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+(cgroup_mutex held by caller)
+
+Called prior to moving one or more tasks into a cgroup; if the
+subsystem returns an error, this will abort the attach operation.
+@tset contains the tasks to be attached and is guaranteed to have at
+least one task in it.
+
+If there are multiple tasks in the taskset, then:
+  - it's guaranteed that all are from the same thread group
+  - @tset contains all tasks from the thread group whether or not
+    they're switching cgroups
+  - the first task is the leader
+
+Each @tset entry also contains the task's old cgroup, and tasks which
+aren't switching cgroups can easily be skipped using the
+cgroup_taskset_for_each() iterator. Note that this isn't called on a
+fork. If this method returns 0 (success) then this should remain valid
+while the caller holds cgroup_mutex and it is ensured that either
+attach() or cancel_attach() will be called in future.
+
+void css_reset(struct cgroup_subsys_state *css)
+(cgroup_mutex held by caller)
+
+An optional operation which should restore @css's configuration to the
+initial state. This is currently only used on the unified hierarchy
+when a subsystem is disabled on a cgroup through
+"cgroup.subtree_control" but should remain enabled because other
+subsystems depend on it. cgroup core makes such a css invisible by
+removing the associated interface files and invokes this callback so
+that the hidden subsystem can return to the initial neutral state.
+This prevents unexpected resource control from a hidden css and
+ensures that the configuration is in the initial state when it is made
+visible again later.
+
+void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+(cgroup_mutex held by caller)
+
+Called when a task attach operation has failed after can_attach() has
+succeeded. A subsystem whose can_attach() has some side-effects should
+provide this function, so that the subsystem can implement a rollback;
+otherwise it is not necessary. This will be called only for subsystems
+whose can_attach() operation has succeeded. The parameters are identical
+to can_attach().
+
+void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+(cgroup_mutex held by caller)
+
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+The parameters are identical to can_attach().
+
+void fork(struct task_struct *task)
+
+Called when a task is forked into a cgroup.
+
+void exit(struct task_struct *task)
+
+Called during task exit.
+
+void free(struct task_struct *task)
+
+Called when the task_struct is freed.
+
+void bind(struct cgroup *root)
+(cgroup_mutex held by caller)
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+
+4. Extended attribute usage
+===========================
+
+The cgroup filesystem supports certain types of extended attributes in its
+directories and files. The currently supported types are:
+	- Trusted (XATTR_TRUSTED)
+	- Security (XATTR_SECURITY)
+
+Both require the CAP_SYS_ADMIN capability to set.
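+
+For example, a sufficiently privileged user can attach a trusted attribute
+to a cgroup directory (the attribute name, value, and mount point below are
+purely illustrative):
+
+# setfattr -n trusted.note -v backup-exempt /sys/fs/cgroup/memory
+# getfattr -n trusted.note /sys/fs/cgroup/memory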
+
+As in tmpfs, the extended attributes in the cgroup filesystem are stored
+using kernel memory and it's advised to keep the usage to a minimum. This
+is the reason why user-defined extended attributes are not supported, since
+any user can set them and there's no limit on the value size.
+
+The currently known users of this feature are SELinux, to limit cgroup usage
+in containers, and systemd, for assorted metadata such as the main PID in a
+cgroup (systemd creates a cgroup per service).
+
+5. Questions
+============
+
+Q: What's up with this '/bin/echo' ?
+A: bash's builtin 'echo' command does not check calls to write() against
+   errors. If you use it in the cgroup file system, you won't be
+   able to tell whether a command succeeded or failed.
+
+Q: When I attach processes, only the first on the line really gets attached!
+A: We can only return one error code per call to write(). So you should also
+   put only ONE PID.
+
diff --git a/Documentation/cgroup-legacy/cpuacct.txt b/Documentation/cgroup-legacy/cpuacct.txt
new file mode 100644
index 0000000..9d73cc0
--- /dev/null
+++ b/Documentation/cgroup-legacy/cpuacct.txt
@@ -0,0 +1,49 @@
+CPU Accounting Controller
+-------------------------
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem.
+
+# mount -t cgroup -ocpuacct none /sys/fs/cgroup
+
+With the above step, the initial or the parent accounting group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
+by this group, which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /sys/fs/cgroup.
+
+# cd /sys/fs/cgroup
+# mkdir g1
+# echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage, and the same is also accumulated in
+/sys/fs/cgroup/cpuacct.usage.
+
+The cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+user: Time spent by tasks of the cgroup in user mode.
+system: Time spent by tasks of the cgroup in kernel mode.
+
+user and system are in USER_HZ units.
+
+The cpuacct controller uses the percpu_counter interface to collect user
+and system times. This has two side effects:
+
+- It is theoretically possible to see wrong values for user and system
+  times. This is because percpu_counter_read() on 32-bit systems isn't
+  safe against concurrent writes.
+- It is possible to see slightly outdated values for user and system times
+  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroup-legacy/cpusets.txt b/Documentation/cgroup-legacy/cpusets.txt
new file mode 100644
index 0000000..fdf7dff
--- /dev/null
+++ b/Documentation/cgroup-legacy/cpusets.txt
@@ -0,0 +1,839 @@
+				CPUSETS
+				-------
+
+Copyright (C) 2004 BULL SA.
+Written by Simon.Derr@bull.net
+
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+Modified by Paul Jackson +Modified by Christoph Lameter +Modified by Paul Menage +Modified by Hidetoshi Seto + +CONTENTS: +========= + +1. Cpusets + 1.1 What are cpusets ? + 1.2 Why are cpusets needed ? + 1.3 How are cpusets implemented ? + 1.4 What are exclusive cpusets ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? + 1.8 What is sched_relax_domain_level ? + 1.9 How do I use cpusets ? +2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Adding/removing cpus + 2.3 Setting flags + 2.4 Attaching processes +3. Questions +4. Contact + +1. Cpusets +========== + +1.1 What are cpusets ? +---------------------- + +Cpusets provide a mechanism for assigning a set of CPUs and Memory +Nodes to a set of tasks. In this document "Memory Node" refers to +an on-line node that contains memory. + +Cpusets constrain the CPU and Memory placement of tasks to only +the resources within a task's current cpuset. They form a nested +hierarchy visible in a virtual file system. These are the essential +hooks, beyond what is already present, required to manage dynamic +job placement on large systems. + +Cpusets use the generic cgroup subsystem described in +Documentation/cgroups/cgroups.txt. + +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that task's cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting task's mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup +virtual file system, manage the attributes and permissions of these +cpusets and which CPUs and Memory Nodes are assigned to each cpuset, +specify and query to which cpuset a task is assigned, and list the +task pids assigned to a cpuset. + + +1.2 Why are cpusets needed ? +---------------------------- + +The management of large computer systems, with many processors (CPUs), +complex memory cache hierarchies and multiple Memory Nodes having +non-uniform access times (NUMA) presents additional challenges for +the efficient scheduling and memory placement of processes. + +Frequently more modest sized systems can be operated with adequate +efficiency just by letting the operating system automatically share +the available CPU and Memory resources amongst the requesting tasks. + +But larger systems, which benefit more from careful processor and +memory placement to reduce memory access times and contention, +and which typically represent a larger investment for the customer, +can benefit from explicitly placing jobs on properly sized subsets of +the system. + +This can be especially valuable on: + + * Web Servers running multiple instances of the same web application, + * Servers running different applications (for instance, a web server + and a database), or + * NUMA systems running large HPC applications with demanding + performance characteristics. + +These subsets, or "soft partitions" must be able to be dynamically +adjusted, as the job mix changes, without impacting other concurrently +executing jobs. The location of the running jobs pages may also be moved +when the memory locations are changed. 
+
+The kernel cpuset patch provides the minimum essential kernel
+mechanisms required to efficiently implement such subsets. It
+leverages existing CPU and Memory Placement facilities in the Linux
+kernel to avoid any additional impact on the critical scheduler or
+memory allocator code.
+
+
+1.3 How are cpusets implemented ?
+---------------------------------
+
+Cpusets provide a Linux kernel mechanism to constrain which CPUs and
+Memory Nodes are used by a process or set of processes.
+
+The Linux kernel already has a pair of mechanisms to specify on which
+CPUs a task may be scheduled (sched_setaffinity) and on which Memory
+Nodes it may obtain memory (mbind, set_mempolicy).
+
+Cpusets extend these two mechanisms as follows:
+
+ - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
+   kernel.
+ - Each task in the system is attached to a cpuset, via a pointer
+   in the task structure to a reference counted cgroup structure.
+ - Calls to sched_setaffinity are filtered to just those CPUs
+   allowed in that task's cpuset.
+ - Calls to mbind and set_mempolicy are filtered to just
+   those Memory Nodes allowed in that task's cpuset.
+ - The root cpuset contains all the system's CPUs and Memory
+   Nodes.
+ - For any cpuset, one can define child cpusets containing a subset
+   of the parent's CPU and Memory Node resources.
+ - The hierarchy of cpusets can be mounted at /dev/cpuset, for
+   browsing and manipulation from user space.
+ - A cpuset may be marked exclusive, which ensures that no other
+   cpuset (except direct ancestors and descendants) may contain
+   any overlapping CPUs or Memory Nodes.
+ - You can list all the tasks (by pid) attached to any cpuset.
+
+The implementation of cpusets requires a few simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cpuset at system boot.
+ - in fork and exit, to attach and detach a task from its cpuset.
+ - in sched_setaffinity, to mask the requested CPUs by what's
+   allowed in that task's cpuset.
+ - in sched.c migrate_live_tasks(), to keep migrating tasks within
+   the CPUs allowed by their cpuset, if possible.
+ - in the mbind and set_mempolicy system calls, to mask the requested
+   Memory Nodes by what's allowed in that task's cpuset.
+ - in page_alloc.c, to restrict memory to allowed nodes.
+ - in vmscan.c, to restrict page recovery to the current cpuset.
+
+You should mount the "cgroup" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel. No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.
+
+The /proc/<pid>/status file for each task has four added lines,
+displaying the task's cpus_allowed (on which CPUs it may be scheduled)
+and mems_allowed (on which Memory Nodes it may obtain memory),
+in the two formats seen in the following example:
+
+  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
+  Cpus_allowed_list:      0-127
+  Mems_allowed:   ffffffff,ffffffff
+  Mems_allowed_list:      0-63
+
+Each cpuset is represented by a directory in the cgroup file system
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
+
+ - cpuset.cpus: list of CPUs in that cpuset
+ - cpuset.mems: list of Memory Nodes in that cpuset
+ - cpuset.memory_migrate flag: if set, move pages to the cpuset's nodes
+ - cpuset.cpu_exclusive flag: is cpu placement exclusive?
+ - cpuset.mem_exclusive flag: is memory placement exclusive?
+ - cpuset.mem_hardwall flag: is memory allocation hardwalled
+ - cpuset.memory_pressure: measure of how much paging pressure in cpuset
+ - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
+ - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
+ - cpuset.sched_load_balance flag: if set, load balance within the CPUs of that cpuset
+ - cpuset.sched_relax_domain_level: the search range when migrating tasks
+
+In addition, only the root cpuset has the following file:
+ - cpuset.memory_pressure_enabled flag: compute memory_pressure?
+
+New cpusets are created using the mkdir system call or shell
+command. The properties of a cpuset, such as its flags, allowed
+CPUs and Memory Nodes, and attached tasks, are modified by writing
+to the appropriate file in that cpuset's directory, as listed above.
+
+The named hierarchical structure of nested cpusets allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cpuset allows organizing the work load
+on a system into related sets of tasks such that each set is constrained
+to using the CPUs and Memory Nodes of a particular cpuset. A task
+may be re-attached to any other cpuset, if allowed by the permissions
+on the necessary cpuset file system directories.
+
+Such management of a system "in the large" integrates smoothly with
+the detailed placement done on individual tasks and memory regions
+using the sched_setaffinity, mbind and set_mempolicy system calls.
+
+The following rules apply to each cpuset:
+
+ - Its CPUs and Memory Nodes must be a subset of its parent's.
+ - It can't be marked exclusive unless its parent is.
+ - If its cpu or memory is exclusive, they may not overlap any sibling.
+
+These rules, and the natural hierarchy of cpusets, enable efficient
+enforcement of the exclusive guarantee, without having to scan all
+cpusets every time any of them change to ensure nothing overlaps an
+exclusive cpuset. Also, the use of a Linux virtual file system (vfs)
+to represent the cpuset hierarchy provides for a familiar permission
+and name space for cpusets, with a minimum of additional kernel code.
+
+The cpus and mems files in the root (top_cpuset) cpuset are
+read-only. The cpus file automatically tracks the value of
+cpu_online_mask using a CPU hotplug notifier, and the mems file
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
+nodes with memory--using the cpuset_track_online_nodes() hook.
+
+
+1.4 What are exclusive cpusets ?
+--------------------------------
+
+If a cpuset is cpu or mem exclusive, no other cpuset, other than
+a direct ancestor or descendant, may share any of the same CPUs or
+Memory Nodes.
+
+A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users. All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space. This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset. To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. + + +1.5 What is memory_pressure ? +----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned to them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. It's up to the +batch manager or other user code to decide what to do about it and +take action. + +==> Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. + + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.6 What is memory spread ? +--------------------------- +There are two boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers and related in +kernel data structures. They are called 'cpuset.memory_spread_page' and +'cpuset.memory_spread_slab'. + +If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then +the kernel will spread the file system buffers (page cache) evenly +over all the nodes that the faulting task is allowed to use, instead +of preferring to put those pages on the node where the task is running. 
+ +If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set, +then the kernel will spread some file system related slab caches, +such as for inodes and dentries evenly over all the nodes that the +faulting task is allowed to use, instead of preferring to put those +pages on the node where the task is running. + +The setting of these flags does not affect anonymous data segment or +stack segment pages of a task. + +By default, both kinds of memory spreading are off, and memory +pages are allocated on the node local to where the task is running, +except perhaps as modified by the task's NUMA mempolicy or cpuset +configuration, so long as sufficient free memory pages are available. + +When new cpusets are created, they inherit the memory spread settings +of their parent. + +Setting memory spreading causes allocations for the affected page +or slab caches to ignore the task's NUMA mempolicy and be spread +instead. Tasks using mbind() or set_mempolicy() calls to set NUMA +mempolicies will not notice any change in these calls as a result of +their containing task's memory spread settings. If memory spreading +is turned off, then the currently specified NUMA mempolicy once again +applies to memory page allocations. + +Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag +files. By default they contain "0", meaning that the feature is off +for that cpuset. If a "1" is written to that file, then that turns +the named feature on. + +The implementation is simple. + +Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag +PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently +joins that cpuset. The page allocation calls for the page cache +is modified to perform an inline check for this PFA_SPREAD_PAGE task +flag, and if set, a call to a new routine cpuset_mem_spread_node() +returns the node to prefer for the allocation. + +Similarly, setting 'cpuset.memory_spread_slab' turns on the flag +PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate +pages from the node returned by cpuset_mem_spread_node(). + +The cpuset_mem_spread_node() routine is also simple. It uses the +value of a per-task rotor cpuset_mem_spread_rotor to select the next +node in the current task's mems_allowed to prefer for the allocation. + +This memory placement policy is also known (in other contexts) as +round-robin or interleave. + +This policy can provide substantial improvements for jobs that need +to place thread local data on the corresponding node, but that need +to access large file system data sets that need to be spread across +the several nodes in the jobs cpuset in order to fit. Without this +policy, especially for jobs that might have one thread reading in the +data set, the memory allocation across the nodes in the jobs cpuset +can become very uneven. + +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched/core.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. 
So the scheduler
+has support to partition the system's CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+
+By default, there is one sched domain covering all CPUs, including those
+marked isolated using the kernel boot time "isolcpus=" argument. However,
+the isolated CPUs will not participate in load balancing, and will not
+have tasks running on them unless explicitly assigned.
+
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+ 1) On large systems, load balancing across many CPUs is expensive.
+    If the system is managed using cpusets to place independent jobs
+    on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+    system overhead on those CPUs, including avoiding task load
+    balancing if that is not needed.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpuset's allowed 'cpuset.cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwise pinned, as by sched_setaffinity) from
+any CPU in that cpuset to any other.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+
+So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+
+Therefore, in the above two situations, the top cpuset flag
+"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendant cpusets. Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest. Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding. So if each of two partially
+overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
+form a single sched domain that is a superset of both.
We won't move +a task to a CPU outside its cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "cpuset.sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +CPUs in "cpuset.isolcpus" were excluded from load balancing by the +isolcpus= kernel boot option, and will never be load balanced regardless +of the value of "cpuset.sched_load_balance" in any cpuset. + +1.7.1 sched_load_balance implementation details. +------------------------------------------------ + +The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of struct cpumask) of CPUs, pairwise disjoint, that cover +all the CPUs that must be load balanced. + +The cpuset code builds a new such partition and passes it to the +scheduler sched domain setup code, to have the sched domains rebuilt +as necessary, whenever: + - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes, + - or CPUs come or go from a cpuset with this flag enabled, + - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs + and with this flag enabled changes, + - or a cpuset with non-empty CPUs and with this flag enabled is removed, + - or a cpu is offlined/onlined. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (struct cpumask) in the +partition. + +The scheduler remembers the currently active sched domain partitions. 
+
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current one, and updates its sched
+domains, removing the old and adding the new, for each change.
+
+
+1.8 What is sched_relax_domain_level ?
+--------------------------------------
+
+Within a sched domain, the scheduler migrates tasks in 2 ways: periodic
+load balancing on the tick, and migration at certain scheduling events.
+
+When a task is woken up, the scheduler tries to move it to an idle CPU.
+For example, if a task A running on CPU X activates another task B
+on the same CPU X, and if CPU Y is X's sibling and idle, then the
+scheduler migrates task B to CPU Y so that task B can start on
+CPU Y without waiting for task A on CPU X.
+
+And if a CPU runs out of tasks in its runqueue, the CPU tries to pull
+extra tasks from other busy CPUs to help them before it goes idle.
+
+Of course, it takes some search cost to find movable tasks and/or
+idle CPUs, so the scheduler might not search all CPUs in the domain
+every time. In fact, on some architectures, the search ranges on
+these events are limited to the same socket or node where the CPU is
+located, while the load balance on the tick searches all of them.
+
+For example, assume CPU Z is relatively far from CPU X. Even if CPU Z
+is idle while CPU X and its siblings are busy, the scheduler can't
+migrate woken task B from X to Z since it is out of its search range.
+As a result, task B on CPU X has to wait for task A or for the load
+balance on the next tick. For some applications in special situations,
+waiting one tick may be too long.
+
+The 'cpuset.sched_relax_domain_level' file allows you to request
+changing this search range as you like. The file takes an int value
+which indicates the size of the search range in levels, ideally as
+follows; otherwise, the initial value -1 indicates that the cpuset has
+no request.
+
+  -1  : no request. use system default or follow request of others.
+   0  : no search.
+   1  : search siblings (hyperthreads in a core).
+   2  : search cores in a package.
+   3  : search cpus in a node [= system wide on non-NUMA system]
+   4  : search nodes in a chunk of node [on NUMA system]
+   5  : search system wide [on NUMA system]
+
+The system default is architecture dependent. The system default
+can be changed using the relax_domain_level= boot parameter.
+
+This file is per-cpuset and affects the sched domain to which the
+cpuset belongs. Therefore, if the flag 'cpuset.sched_load_balance' of
+a cpuset is disabled, then 'cpuset.sched_relax_domain_level' has no
+effect since there is no sched domain belonging to the cpuset.
+
+If multiple cpusets are overlapping and hence form a single sched
+domain, the largest value among them is used. Be careful: if one
+requests 0 and the others are -1, then 0 is used.
+
+Note that modifying this file will have both good and bad effects,
+and whether it is acceptable or not depends on your situation.
+Don't modify this file if you are not sure.
+
+If your situation is such that:
+ - the migration costs between CPUs can be assumed to be considerably
+   small (for you) due to your special application's behavior or
+   special hardware support for CPU caches, etc.,
+ - the search cost has no impact (for you), or you can make the
+   search cost small enough by managing your cpusets compactly, etc.,
+ - low latency is required even if it sacrifices cache hit rate, etc.,
+then increasing 'sched_relax_domain_level' would benefit you.
+
+
+1.9 How do I use cpusets ?
+-------------------------- + +In order to minimize the impact of cpusets on critical kernel +code, such as the scheduler, and due to the fact that the kernel +does not support one task updating the memory placement of another +task directly, the impact on a task of changing its cpuset CPU +or Memory Node placement, or of changing to which cpuset a task +is attached, is subtle. + +If a cpuset has its Memory Nodes modified, then for each task attached +to that cpuset, the next time that the kernel attempts to allocate +a page of memory for that task, the kernel will notice the change +in the task's cpuset, and update its per-task memory placement to +remain within the new cpusets memory placement. If the task was using +mempolicy MPOL_BIND, and the nodes to which it was bound overlap with +its new cpuset, then the task will continue to use whatever subset +of MPOL_BIND nodes are still allowed in the new cpuset. If the task +was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed +in the new cpuset, then the task will be essentially treated as if it +was MPOL_BIND bound to the new cpuset (even though its NUMA placement, +as queried by get_mempolicy(), doesn't change). If a task is moved +from one cpuset to another, then the kernel will adjust the task's +memory placement, as above, the next time that the kernel attempts +to allocate a page of memory for that task. + +If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a task's pid is written to another cpusets 'cpuset.tasks' file, then its +allowed CPU placement is changed immediately. If such a task had been +bound to some subset of its cpuset using the sched_setaffinity() call, +the task will be allowed to run on any CPU allowed in its new cpuset, +negating the effect of the prior sched_setaffinity() call. + +In summary, the memory placement of a task whose cpuset is changed is +updated by the kernel, on the next allocation of a page for that task, +and the processor placement is updated immediately. + +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated, so long as it remains allocated, even if the +cpusets memory placement policy 'cpuset.mems' subsequently changes. +If the cpuset flag file 'cpuset.memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the task's new cpuset. The relative placement of the page within +the cpuset is preserved during these migration operations if possible. +For example if the page was on the second valid node of the prior cpuset +then the page will be placed on the second valid node of the new cpuset. + +Also if 'cpuset.memory_migrate' is set true, then if that cpuset's +'cpuset.mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'cpuset.mems', +will be moved to nodes in the new setting of 'mems.' +Pages that were not in the task's prior cpuset, or in the cpuset's +prior 'cpuset.mems' setting, will not be moved. + +There is an exception to the above. If hotplug functionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then all the tasks in that cpuset will be moved to the nearest ancestor +with non-empty cpus. 
But the moving of some (or all) tasks might fail if +cpuset is bound with another cgroup subsystem which has some restrictions +on task attaching. In this failing case, those tasks will stay +in the original cpuset, and the kernel will automatically update +their cpus_allowed to allow all online CPUs. When memory hotplug +functionality for removing Memory Nodes is available, a similar exception +is expected to apply there as well. In general, the kernel prefers to +violate cpuset placement, over starving a task that has had all +its allowed CPUs or Memory Nodes taken offline. + +There is a second exception to the above. GFP_ATOMIC requests are +kernel internal allocations that must be satisfied, immediately. +The kernel may drop some request, in rare cases even panic, if a +GFP_ATOMIC alloc fails. If the request cannot be satisfied within +the current task's cpuset, then we relax the cpuset, and look for +memory anywhere we can find it. It's better to violate the cpuset +than stress the kernel. + +To start a new job that is to be contained within a cpuset, the steps are: + + 1) mkdir /sys/fs/cgroup/cpuset + 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + 3) Create the new cpuset by doing mkdir's and write's (or echo's) in + the /sys/fs/cgroup/cpuset virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cpuset by writing its pid to the + /sys/fs/cgroup/cpuset tasks file for that cpuset. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cpuset +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cpuset: + + mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset + cd /sys/fs/cgroup/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpuset.cpus + /bin/echo 1 > cpuset.mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cpuset Charlie + # The next line should display '/Charlie' + cat /proc/self/cpuset + +There are ways to query or modify cpusets: + - via the cpuset file system directly, using the various cd, mkdir, echo, + cat, rmdir commands from the shell, or their equivalent from C. + - via the C library libcpuset. + - via the C library libcgroup. + (http://sourceforge.net/projects/libcg/) + - via the python application cset. + (http://code.google.com/p/cpuset/) + +The sched_setaffinity calls can also be done at the shell prompt using +SGI's runon or Robert Love's taskset. The mbind and set_mempolicy +calls can be done at the shell prompt using the numactl command +(part of Andi Kleen's numa package). + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cpusets can be done through the cpuset +virtual filesystem. + +To mount it, type: +# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset + +Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the +tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset +is the cpuset that holds the whole system. + +If you want to create a new cpuset under /sys/fs/cgroup/cpuset: +# cd /sys/fs/cgroup/cpuset +# mkdir my_cpuset + +Now you want to do something with this cpuset. 
+# cd my_cpuset + +In this directory you can find several files: +# ls +cgroup.clone_children cpuset.memory_pressure +cgroup.event_control cpuset.memory_spread_page +cgroup.procs cpuset.memory_spread_slab +cpuset.cpu_exclusive cpuset.mems +cpuset.cpus cpuset.sched_load_balance +cpuset.mem_exclusive cpuset.sched_relax_domain_level +cpuset.mem_hardwall notify_on_release +cpuset.memory_migrate tasks + +Reading them will give you information about the state of this cpuset: +the CPUs and Memory Nodes it can use, the processes that are using +it, its properties. By writing to these files you can manipulate +the cpuset. + +Set some flags: +# /bin/echo 1 > cpuset.cpu_exclusive + +Add some cpus: +# /bin/echo 0-7 > cpuset.cpus + +Add some mems: +# /bin/echo 0-7 > cpuset.mems + +Now attach your shell to this cpuset: +# /bin/echo $$ > tasks + +You can also create cpusets inside your cpuset by using mkdir in this +directory. +# mkdir my_sub_cs + +To remove a cpuset, just use rmdir: +# rmdir my_sub_cs +This will fail if the cpuset is in use (has cpusets inside, or has +processes attached). + +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. + +The command + +mount -t cpuset X /sys/fs/cgroup/cpuset + +is equivalent to + +mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset +echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent + +2.2 Adding/removing cpus +------------------------ + +This is the syntax to use when writing in the cpus or mems files +in cpuset directories: + +# /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 +# /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 + +To add a CPU to a cpuset, write the new list of CPUs including the +CPU to be added. To add 6 to the above cpuset: + +# /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 + +Similarly to remove a CPU from a cpuset, write the new list of CPUs +without the CPU to be removed. + +To remove all the CPUs: + +# /bin/echo "" > cpuset.cpus -> clear cpus list + +2.3 Setting flags +----------------- + +The syntax is very simple: + +# /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' +# /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' + +2.4 Attaching processes +----------------------- + +# /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another: + +# /bin/echo PID1 > tasks +# /bin/echo PID2 > tasks + ... +# /bin/echo PIDn > tasks + + +3. Questions +============ + +Q: what's up with this '/bin/echo' ? +A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cpuset file system, you won't be + able to tell whether a command succeeded or failed. + +Q: When I attach processes, only the first of the line gets really attached ! +A: We can only return one error code per call to write(). So you should also + put only ONE pid. + +4. Contact +========== + +Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroup-legacy/devices.txt b/Documentation/cgroup-legacy/devices.txt new file mode 100644 index 0000000..3c1095c --- /dev/null +++ b/Documentation/cgroup-legacy/devices.txt @@ -0,0 +1,116 @@ +Device Whitelist Controller + +1. Description: + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. 
A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied by its parent. + +2. User Interface + +An entry is added using devices.allow, and removed using +devices.deny. For instance + + echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing + + echo a > /sys/fs/cgroup/1/devices.deny + +will remove the default 'a *:* rwm' entry. Doing + + echo a > /sys/fs/cgroup/1/devices.allow + +will add the 'a *:* rwm' entry to the whitelist. + +3. Security + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendant of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. + +4. Hierarchy + +device cgroups maintain hierarchy by making sure a cgroup never has more +access permissions than its parent. Every time an entry is written to +a cgroup's devices.deny file, all its children will have that entry removed +from their whitelist and all the locally set whitelist entries will be +re-evaluated. In case one of the locally set whitelist entries would provide +more access than the cgroup's parent, it'll be removed from the whitelist. + +Example: + A + / \ + B + + group behavior exceptions + A allow "b 8:* rwm", "c 116:1 rw" + B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" + +If a device is denied in group A: + # echo "c 116:* r" > A/devices.deny +it'll propagate down and after revalidating B's entries, the whitelist entry +"c 116:2 rwm" will be removed: + + group whitelist entries denied devices + A all "b 8:* rwm", "c 116:* rw" + B "c 1:3 rwm", "b 3:* rwm" all the rest + +In case parent's exceptions change and local exceptions are not allowed +anymore, they'll be deleted. + +Notice that new whitelist entries will not be propagated: + A + / \ + B + + group whitelist entries denied devices + A "c 1:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +when adding "c *:3 rwm": + # echo "c *:3 rwm" >A/devices.allow + +the result: + group whitelist entries denied devices + A "c *:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +but now it'll be possible to add new entries to B: + # echo "c 2:3 rwm" >B/devices.allow + # echo "c 50:3 r" >B/devices.allow +or even + # echo "c *:3 rwm" >B/devices.allow + +Allowing or denying all by writing 'a' to devices.allow or devices.deny will +not be possible once the device cgroups has children. + +4.1 Hierarchy (internal implementation) + +device cgroups is implemented internally using a behavior (ALLOW, DENY) and a +list of exceptions. 
The internal state is controlled using the same user
+interface to preserve compatibility with the previous whitelist-only
+implementation. Removal or addition of exceptions that will reduce the access
+to devices will be propagated down the hierarchy.
+For every propagated exception, the effective rules will be re-evaluated based
+on the current parent's access rules.
diff --git a/Documentation/cgroup-legacy/freezer-subsystem.txt b/Documentation/cgroup-legacy/freezer-subsystem.txt
new file mode 100644
index 0000000..e831cb2
--- /dev/null
+++ b/Documentation/cgroup-legacy/freezer-subsystem.txt
@@ -0,0 +1,123 @@
+The cgroup freezer is useful to batch job management systems, which
+start and stop sets of tasks in order to schedule the resources of a
+machine according to the desires of a system administrator. This sort
+of program is often used on HPC clusters to schedule access to the
+cluster as a whole. The cgroup freezer uses cgroups to describe the
+set of tasks to be started/stopped by the batch job management system.
+It also provides a means to start and stop the tasks composing the job.
+
+The cgroup freezer will also be useful for checkpointing running groups
+of tasks. The freezer allows the checkpoint code to obtain a consistent
+image of the tasks by attempting to force the tasks in a cgroup into a
+quiescent state. Once the tasks are quiescent another task can
+walk /proc or invoke a kernel interface to gather information about the
+quiesced tasks. Checkpointed tasks can be restarted later should a
+recoverable error occur. This also allows the checkpointed tasks to be
+migrated between nodes in a cluster by copying the gathered information
+to another node and restarting the tasks there.
+
+Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
+and resuming tasks in userspace. Both of these signals are observable
+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
+blocked, or ignored it can be seen by waiting or ptracing parent tasks.
+SIGCONT is especially unsuitable since it can be caught by the task. Any
+programs designed to watch for SIGSTOP and SIGCONT could be broken by
+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
+demonstrate this problem using nested bash shells:
+
+	$ echo $$
+	16644
+	$ bash
+	$ echo $$
+	16690
+
+	From a second, unrelated bash shell:
+	$ kill -SIGSTOP 16690
+	$ kill -SIGCONT 16690
+
+This happens because bash can observe both signals and choose how it
+responds to them.
+
+Another example of a program which catches and responds to these
+signals is gdb. In fact any program designed to use ptrace is likely to
+have a problem with this method of stopping and resuming tasks.
+
+In contrast, the cgroup freezer uses the kernel freezer code to
+prevent the freeze/unfreeze cycle from becoming visible to the tasks
+being frozen. This allows the bash example above and gdb to run as
+expected.
+
+The cgroup freezer is hierarchical. Freezing a cgroup freezes all
+tasks belonging to the cgroup and all its descendant cgroups. Each
+cgroup has its own state (self-state) and the state inherited from the
+parent (parent-state). Iff both states are THAWED, the cgroup is
+THAWED.
+
+The following cgroupfs files are created by the cgroup freezer.
+
+* freezer.state: Read-write.
+
+  When read, returns the effective state of the cgroup - "THAWED",
+  "FREEZING" or "FROZEN". This is the combined self and parent-states.
+  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
+
+  A FREEZING cgroup transitions into the FROZEN state when all tasks
+  belonging to the cgroup and its descendants become frozen. Note that
+  a cgroup reverts to FREEZING from FROZEN after a new task is added
+  to the cgroup or one of its descendant cgroups until the new task is
+  frozen.
+
+  When written, sets the self-state of the cgroup. Two values are
+  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
+  if not already freezing, enters the FREEZING state along with all its
+  descendant cgroups.
+
+  If THAWED is written, the self-state of the cgroup is changed to
+  THAWED. Note that the effective state may not change to THAWED if
+  the parent-state is still freezing. If a cgroup's effective state
+  becomes THAWED, all its descendants which are freezing because of
+  the cgroup also leave the freezing state.
+
+* freezer.self_freezing: Read only.
+
+  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
+  This value is 1 iff the last write to freezer.state was "FROZEN".
+
+* freezer.parent_freezing: Read only.
+
+  Shows the parent-state. 0 if none of the cgroup's ancestors is
+  frozen; otherwise, 1.
+
+The root cgroup is non-freezable and the above interface files don't
+exist.
+
+* Examples of usage :
+
+   # mkdir /sys/fs/cgroup/freezer
+   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
+   # mkdir /sys/fs/cgroup/freezer/0
+   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
+
+to get status of the freezer subsystem :
+
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+to freeze all tasks in the container :
+
+   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FREEZING
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FROZEN
+
+to unfreeze all tasks in the container :
+
+   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+This is the basic mechanism which should do the right thing for user
+space tasks in a simple scenario.
diff --git a/Documentation/cgroup-legacy/hugetlb.txt b/Documentation/cgroup-legacy/hugetlb.txt
new file mode 100644
index 0000000..106245c
--- /dev/null
+++ b/Documentation/cgroup-legacy/hugetlb.txt
@@ -0,0 +1,45 @@
+HugeTLB Controller
+-------------------
+
+The HugeTLB controller allows limiting HugeTLB usage per control group
+and enforces the limit at page fault time. Since HugeTLB doesn't
+support page reclaim, enforcing the limit at page fault time implies
+that the application will get a SIGBUS signal if it tries to access
+HugeTLB pages beyond its limit. This requires the application to know
+beforehand how many HugeTLB pages it would require for its use.
+
+The HugeTLB controller can be enabled by first mounting the cgroup
+filesystem.
+
+# mount -t cgroup -o hugetlb none /sys/fs/cgroup
+
+With the above step, the initial or the parent HugeTLB group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+
+New groups can be created under the parent group /sys/fs/cgroup.
+
+# cd /sys/fs/cgroup
+# mkdir g1
+# echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it.
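+
+For example, on a system whose hugepage size is 2MB, usage in g1 could
+then be capped as follows (the file names follow the per-size pattern
+summarized below, and the 1G value is only illustrative):
+
+# echo 1G > g1/hugetlb.2MB.limit_in_bytes
+# cat g1/hugetlb.2MB.usage_in_bytes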
+
+Brief summary of control files
+
+ hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
+ hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
+ hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
+ hugetlb.<hugepagesize>.failcnt            # show the number of allocation failures due to the HugeTLB limit
+
+For a system supporting two hugepage sizes (16M and 16G) the control
+files include:
+
+hugetlb.16GB.limit_in_bytes
+hugetlb.16GB.max_usage_in_bytes
+hugetlb.16GB.usage_in_bytes
+hugetlb.16GB.failcnt
+hugetlb.16MB.limit_in_bytes
+hugetlb.16MB.max_usage_in_bytes
+hugetlb.16MB.usage_in_bytes
+hugetlb.16MB.failcnt
diff --git a/Documentation/cgroup-legacy/memcg_test.txt b/Documentation/cgroup-legacy/memcg_test.txt
new file mode 100644
index 0000000..8870b02
--- /dev/null
+++ b/Documentation/cgroup-legacy/memcg_test.txt
@@ -0,0 +1,280 @@
+Memory Resource Controller (Memcg) Implementation Memo.
+Last Updated: 2010/2
+Base Kernel Version: based on 2.6.33-rc7-mm (candidate for 34).
+
+Because the VM is getting complex (one of the reasons is memcg...),
+memcg's behavior is complex. This is a document for memcg's internal
+behavior. Please note that implementation details can be changed.
+
+(*) Topics on the API should be in Documentation/cgroups/memory.txt
+
+0. How to record usage ?
+   2 objects are used.
+
+   page_cgroup ....an object per page.
+	Allocated at boot or memory hotplug. Freed at memory hot removal.
+
+   swap_cgroup ... an entry per swp_entry.
+	Allocated at swapon(). Freed at swapoff().
+
+   The page_cgroup has a USED bit, and double counting against a
+   page_cgroup never occurs. swap_cgroup is used only when a charged
+   page is swapped out.
+
+1. Charge
+
+   a page/swp_entry may be charged (usage += PAGE_SIZE) at
+
+	mem_cgroup_try_charge()
+
+2. Uncharge
+   a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
+
+	mem_cgroup_uncharge()
+	  Called when a page's refcount goes down to 0.
+
+	mem_cgroup_uncharge_swap()
+	  Called when a swp_entry's refcnt goes down to 0. A charge against
+	  swap disappears.
+
+3. charge-commit-cancel
+	Memcg pages are charged in two steps:
+		mem_cgroup_try_charge()
+		mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
+
+	At try_charge(), there are no flags to say "this page is charged".
+	At this point, usage += PAGE_SIZE.
+
+	At commit(), the page is associated with the memcg.
+
+	At cancel(), simply usage -= PAGE_SIZE.
+
+In the explanation below, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
+
+4. Anonymous
+	An anonymous page is newly allocated at
+		  - page fault into a MAP_ANONYMOUS mapping.
+		  - Copy-On-Write.
+
+	4.1 Swap-in.
+	At swap-in, the page is taken from the swap cache. There are 2 cases.
+
+	(a) If the SwapCache is newly allocated and read, it has no charges.
+	(b) If the SwapCache has been mapped by processes, it has been
+	    charged already.
+
+	4.2 Swap-out.
+	At swap-out, the typical state transition is as below.
+
+	(a) add to swap cache. (marked as SwapCache)
+	    swp_entry's refcnt += 1.
+	(b) fully unmapped.
+	    swp_entry's refcnt += # of ptes.
+	(c) write back to swap.
+	(d) delete from swap cache. (remove from SwapCache)
+	    swp_entry's refcnt -= 1.
+
+	Finally, at task exit,
+	(e) zap_pte() is called and swp_entry's refcnt -= 1 -> 0.
+
+5. Page Cache
+	Page Cache is charged at
+	- add_to_page_cache_locked().
+
+	The logic is very clear. (About migration, see below)
+	Note: __remove_from_page_cache() is called by remove_from_page_cache()
+	and __remove_mapping().
+
+6. Shmem(tmpfs) Page Cache
+	The best way to understand shmem's page state transitions is to read
+	mm/shmem.c.
+	But a brief explanation of memcg's behavior around shmem will be
+	helpful for understanding the logic.
+
+	Shmem's page (just leaf page, not direct/indirect block) can be on
+		- the radix-tree of shmem's inode.
+		- the SwapCache.
+		- both the radix-tree and the SwapCache. This happens at
+		  swap-in and swap-out.
+
+	It's charged when...
+	- A new page is added to shmem's radix-tree.
+	- A swap page is read in. (this moves a charge from swap_cgroup to
+	  page_cgroup)
+
+7. Page Migration
+
+	mem_cgroup_migrate()
+
+8. LRU
+	Each memcg has its own private LRU. Now, its handling is under the
+	global VM's control (meaning it's handled under the global
+	zone->lru_lock). Almost all routines around memcg's LRU are called by
+	the global LRU's list management functions under zone->lru_lock.
+
+	A special function is mem_cgroup_isolate_pages(). This scans
+	memcg's private LRU and calls __isolate_lru_page() to extract a page
+	from the LRU.
+	(By __isolate_lru_page(), the page is removed from both the global
+	and the private LRU.)
+
+
+9. Typical Tests.
+
+	Tests for racy cases.
+
+	9.1 Small limit to memcg.
+	When testing a racy case, it's a good idea to set memcg's limit
+	to be very small rather than in GB. Many races have been found in
+	tests under limits of a few KB or a few MB.
+	(Memory behavior under a GB limit and memory behavior under an MB
+	 limit are very different situations.)
+
+	9.2 Shmem
+	Historically, memcg's shmem handling was poor and we saw some amount
+	of trouble here. This is because shmem is page cache but can also be
+	SwapCache. Testing with shmem/tmpfs is always a good test.
+
+	9.3 Migration
+	For NUMA, migration is another special case. cpusets are useful for
+	an easy test. The following is a sample script to do migration.
+
+	mount -t cgroup -o cpuset none /opt/cpuset
+
+	mkdir /opt/cpuset/01
+	echo 1 > /opt/cpuset/01/cpuset.cpus
+	echo 0 > /opt/cpuset/01/cpuset.mems
+	echo 1 > /opt/cpuset/01/cpuset.memory_migrate
+	mkdir /opt/cpuset/02
+	echo 1 > /opt/cpuset/02/cpuset.cpus
+	echo 1 > /opt/cpuset/02/cpuset.mems
+	echo 1 > /opt/cpuset/02/cpuset.memory_migrate
+
+	With the above setup, when you move a task from 01 to 02, page
+	migration from node 0 to node 1 will occur. The following is a script
+	to migrate all tasks under a cpuset.
+	--
+	move_task()
+	{
+	for pid in $1
+	do
+		/bin/echo $pid >$2/tasks 2>/dev/null
+		echo -n $pid
+		echo -n " "
+	done
+	echo END
+	}
+
+	G1_TASK=`cat ${G1}/tasks`
+	G2_TASK=`cat ${G2}/tasks`
+	move_task "${G1_TASK}" ${G2} &
+	--
+	9.4 Memory hotplug.
+	The memory hotplug test is another good test.
+	To offline memory, do the following.
+	# echo offline > /sys/devices/system/memory/memoryXXX/state
+	(XXX is the place of memory)
+	This is an easy way to test page migration, too.
+
+	9.5 mkdir/rmdir
+	When using hierarchy, the mkdir/rmdir test should be done.
+	Use tests like the following.
+
+	echo 1 >/opt/cgroup/01/memory/use_hierarchy
+	mkdir /opt/cgroup/01/child_a
+	mkdir /opt/cgroup/01/child_b
+
+	set a limit on 01.
+	add a limit to 01/child_b
+	run jobs under child_a and child_b
+
+	create/delete the following groups at random while jobs are running.
+	/opt/cgroup/01/child_a/child_aa
+	/opt/cgroup/01/child_b/child_bb
+	/opt/cgroup/01/child_c
+
+	running new jobs in new groups is also good.
+
+	9.6 Mount with other subsystems.
+	Mounting with other subsystems is a good test because there are
+	races and lock dependencies with other cgroup subsystems.
+
+	example)
+	# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
+
+	and do task moves, mkdir, rmdir, etc. under this.
+
+	9.7 swapoff.
+	Besides the management of swap being one of the complicated parts of
+	memcg, the call path of swap-in at swapoff is not the same as the
+	usual swap-in path. It's worth testing explicitly.
+
+	For example, a test like the following is good.
+	(Shell-A)
+	# mount -t cgroup none /cgroup -o memory
+	# mkdir /cgroup/test
+	# echo 40M > /cgroup/test/memory.limit_in_bytes
+	# echo 0 > /cgroup/test/tasks
+	Run a malloc(100M) program under this. You'll see 60M of swap usage.
+	(Shell-B)
+	# move all tasks in /cgroup/test to /cgroup
+	# /sbin/swapoff -a
+	# rmdir /cgroup/test
+	# kill malloc task.
+
+	Of course, the tmpfs vs. swapoff case should be tested, too.
+
+	9.8 OOM-Killer
+	Out-of-memory caused by a memcg's limit will kill tasks under
+	the memcg. When hierarchy is used, a task under the hierarchy
+	will be killed by the kernel.
+	In this case, panic_on_oom shouldn't be invoked and tasks
+	in other groups shouldn't be killed.
+
+	It's not difficult to cause OOM under memcg, as follows.
+	Case A) when you can use swapoff
+	#swapoff -a
+	#echo 50M > /memory.limit_in_bytes
+	run 51M of malloc
+
+	Case B) when you use the mem+swap limitation.
+	#echo 50M > memory.limit_in_bytes
+	#echo 50M > memory.memsw.limit_in_bytes
+	run 51M of malloc
+
+	9.9 Move charges at task migration
+	Charges associated with a task can be moved along with task migration.
+
+	(Shell-A)
+	#mkdir /cgroup/A
+	#echo $$ >/cgroup/A/tasks
+	run some programs which use some amount of memory in /cgroup/A.
+
+	(Shell-B)
+	#mkdir /cgroup/B
+	#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
+	#echo "pid of the program running in group A" >/cgroup/B/tasks
+
+	You can see that charges have been moved by reading *.usage_in_bytes
+	or memory.stat of both A and B.
+	See 8.2 of Documentation/cgroups/memory.txt to see what value should
+	be written to move_charge_at_immigrate.
+
+	9.10 Memory thresholds
+	The memory controller implements memory thresholds using the cgroups
+	notification API. You can use tools/cgroup/cgroup_event_listener.c to
+	test it.
+
+	(Shell-A) Create a cgroup and run the event listener
+	# mkdir /cgroup/A
+	# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
+
+	(Shell-B) Add a task to the cgroup and try to allocate and free memory
+	# echo $$ >/cgroup/A/tasks
+	# a="$(dd if=/dev/zero bs=1M count=10)"
+	# a=
+
+	You will see a message from cgroup_event_listener every time you
+	cross a threshold.
+
+	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
+
+	It's a good idea to test the root cgroup as well.
diff --git a/Documentation/cgroup-legacy/memory.txt b/Documentation/cgroup-legacy/memory.txt
new file mode 100644
index 0000000..ff71e16
--- /dev/null
+++ b/Documentation/cgroup-legacy/memory.txt
@@ -0,0 +1,876 @@
+Memory Resource Controller
+
+NOTE: This document is hopelessly outdated and asks for a complete
+      rewrite. It still contains useful information, so we are keeping it
+      here, but make sure to check the current code if you need a deeper
+      understanding.
+
+NOTE: The Memory Resource Controller has generically been referred to as the
+      memory controller in this document. Do not confuse the memory
+      controller used here with the memory controller that is used in
+      hardware.
+
+(For editors)
+In this document:
+      When we mention a cgroup (cgroupfs's directory) with memory controller,
+      we call it "memory cgroup".
+      When you look at the git log and the source code, you'll see that
+      patch titles and function names tend to use "memcg".
+      In this document, we avoid using it.
+
+Benefits and Purpose of the memory controller
+
+The memory controller isolates the memory behaviour of a group of tasks
+from the rest of the system. The article on LWN [12] mentions some probable
+uses of the memory controller. The memory controller can be used to
+
+a. Isolate an application or a group of applications
+   Memory-hungry applications can be isolated and limited to a smaller
+   amount of memory.
+b. Create a cgroup with a limited amount of memory; this can be used
+   as a good alternative to booting with mem=XXXX.
+c. Virtualization solutions can control the amount of memory they want
+   to assign to a virtual machine instance.
+d. A CD/DVD burner could control the amount of memory used by the
+   rest of the system to ensure that burning does not fail due to lack
+   of available memory.
+e. There are several other use cases; find one or use the controller just
+   for fun (to learn and hack on the VM subsystem).
+
+Current Status: linux-2.6.34-mmotm (development version of April 2010)
+
+Features:
+ - accounting of anonymous pages, file caches and swap caches, and
+   limiting their usage.
+ - pages are linked to per-memcg LRUs exclusively; there is no global LRU.
+ - optionally, memory+swap usage can be accounted and limited.
+ - hierarchical accounting
+ - soft limit
+ - moving (recharging) charges along with a migrating task is selectable.
+ - usage threshold notifier
+ - memory pressure notifier
+ - oom-killer disable knob and oom-notifier
+ - Root cgroup has no limit controls.
+
+ Kernel memory support is a work in progress, and the current version
+ provides basic functionality. (See Section 2.7)
+
+Brief summary of control files.
+
+ tasks                           # attach a task (thread) and show list of threads
+ cgroup.procs                    # show list of processes
+ cgroup.event_control            # an interface for event_fd()
+ memory.usage_in_bytes           # show current usage for memory
+                                 (See 5.5 for details)
+ memory.memsw.usage_in_bytes     # show current usage for memory+Swap
+                                 (See 5.5 for details)
+ memory.limit_in_bytes           # set/show limit of memory usage
+ memory.memsw.limit_in_bytes     # set/show limit of memory+Swap usage
+ memory.failcnt                  # show the number of times memory usage hit the limit
+ memory.memsw.failcnt            # show the number of times memory+Swap usage hit the limit
+ memory.max_usage_in_bytes       # show max memory usage recorded
+ memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded
+ memory.soft_limit_in_bytes      # set/show soft limit of memory usage
+ memory.stat                     # show various statistics
+ memory.use_hierarchy            # set/show hierarchical account enabled
+ memory.force_empty              # trigger forced move charge to parent
+ memory.pressure_level           # set memory pressure notifications
+ memory.swappiness               # set/show swappiness parameter of vmscan
+                                 (See sysctl's vm.swappiness)
+ memory.move_charge_at_immigrate # set/show controls of moving charges
+ memory.oom_control              # set/show oom controls.
+ memory.numa_stat # show the number of memory usage per numa node + + memory.kmem.limit_in_bytes # set/show hard limit for kernel memory + memory.kmem.usage_in_bytes # show current kernel memory allocation + memory.kmem.failcnt # show the number of kernel memory usage hits limits + memory.kmem.max_usage_in_bytes # show max kernel memory usage recorded + + memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory + memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation + memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits + memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded + +1. History + +The memory controller has a long history. A request for comments for the memory +controller was posted by Balbir Singh [1]. At the time the RFC was posted +there were several implementations for memory control. The goal of the +RFC was to build consensus and agreement for the minimal features required +for memory control. The first RSS controller was posted by Balbir Singh[2] +in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the +RSS controller. At OLS, at the resource management BoF, everyone suggested +that we handle both page cache and RSS together. Another request was raised +to allow user space handling of OOM. The current memory controller is +at version 6; it combines both mapped (RSS) and unmapped Page +Cache Control [11]. + +2. Memory Control + +Memory is a unique resource in the sense that it is present in a limited +amount. If a task requires a lot of CPU processing, the task can spread +its processing over a period of hours, days, months or years, but with +memory, the same physical memory needs to be reused to accomplish the task. + +The memory controller implementation has been divided into phases. These +are: + +1. Memory controller +2. mlock(2) controller +3. Kernel user memory accounting and slab control +4. user mappings length controller + +The memory controller is the first controller developed. + +2.1. Design + +The core of the design is a counter called the page_counter. The +page_counter tracks the current memory usage and limit of the group of +processes associated with the controller. Each cgroup has a memory controller +specific data structure (mem_cgroup) associated with it. + +2.2. Accounting + + +--------------------+ + | mem_cgroup | + | (page_counter) | + +--------------------+ + / ^ \ + / | \ + +---------------+ | +---------------+ + | mm_struct | |.... | mm_struct | + | | | | | + +---------------+ | +---------------+ + | + + --------------+ + | + +---------------+ +------+--------+ + | page +----------> page_cgroup| + | | | | + +---------------+ +---------------+ + + (Figure 1: Hierarchy of Accounting) + + +Figure 1 shows the important aspects of the controller + +1. Accounting happens per cgroup +2. Each mm_struct knows about which cgroup it belongs to +3. Each page has a pointer to the page_cgroup, which in turn knows the + cgroup it belongs to + +The accounting is done as follows: mem_cgroup_charge_common() is invoked to +set up the necessary data structures and check if the cgroup that is being +charged is over its limit. If it is, then reclaim is invoked on the cgroup. +More details can be found in the reclaim section of this document. +If everything goes well, a page meta-data-structure called page_cgroup is +updated. page_cgroup has its own LRU on cgroup. +(*) page_cgroup structure is allocated at boot/memory-hotplug time. 
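+
+As a quick way to see this accounting in action from userspace (a sketch:
+it assumes the memory controller is mounted as described in section 3.1,
+and the group name "demo" is arbitrary):
+
+# mkdir /sys/fs/cgroup/memory/demo
+# echo $$ > /sys/fs/cgroup/memory/demo/tasks
+# cat /proc/self/cgroup                                # this shell's cgroups
+# cat /sys/fs/cgroup/memory/demo/memory.usage_in_bytes
+
+Pages touched by tasks in "demo" are charged to it, so repeated reads of
+memory.usage_in_bytes show the charge growing.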
+ +2.2.1 Accounting details + +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +Some pages which are never reclaimable and will not be on the LRU +are not accounted. We just account pages under usual VM management. + +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +An RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. Even if RSS pages are fully +unmapped (by kswapd), they may exist as SwapCache in the system until they +are really freed. Such SwapCaches are also accounted. +A swapped-in page is not accounted until it's mapped. + +Note: The kernel does swapin-readahead and reads multiple swaps at once. +This means swapped-in pages may contain pages for other tasks than a task +causing page fault. So, we avoid accounting at swap-in I/O. + +At page migration, accounting information is kept. + +Note: we just account pages-on-LRU because our purpose is to control amount +of used pages; not-on-LRU pages tend to be out-of-control from VM view. + +2.3 Shared Page Accounting + +Shared pages are accounted on the basis of the first touch approach. The +cgroup that first touches a page is accounted for the page. The principle +behind this approach is that a cgroup that aggressively uses a shared +page will eventually get charged for it (once it is uncharged from +the cgroup that brought it in -- this will happen on memory pressure). + +But see section 8.2: when moving a task to another cgroup, its pages may +be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. + +Exception: If CONFIG_MEMCG_SWAP is not used. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + +2.4 Swap Extension (CONFIG_MEMCG_SWAP) + +Swap Extension allows you to record charge for swap. A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +memsw means memory+swap. Usage of memory+swap is limited by +memsw.limit_in_bytes. + +Example: Assume a system with 4G of swap. A task which allocates 6G of memory +(by mistake) under 2G memory limitation will use all swap. +In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. +By using the memsw limit, you can avoid system OOM which can be caused by swap +shortage. + +* why 'memory+swap' rather than swap. +The global LRU(kswapd) can swap out arbitrary pages. Swap-out means +to move account from memory to swap...there is no change in usage of +memory+swap. In other words, when we want to limit the usage of swap without +affecting global LRU, memory+swap limit is better than just limiting swap from +an OS point of view. + +* What happens when a cgroup hits memory.memsw.limit_in_bytes +When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out +in this cgroup. Then, swap-out will not be done by cgroup routine and file +caches are dropped. But as mentioned above, global LRU can do swapout memory +from it for sanity of the system's memory management state. You can't forbid +it by cgroup. 
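+
+Expressed as commands, the 2G/3G example above would look like this (a
+sketch assuming the cgroup created in section 3.2; suffix handling is
+described there as well):
+
+# echo 2G > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+# echo 3G > /sys/fs/cgroup/memory/0/memory.memsw.limit_in_bytes
+
+With this, the group's memory+swap usage can never exceed 3G, so the
+runaway 6G allocation can consume at most 1G of swap instead of all 4G.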
+
+2.5 Reclaim
+
+Each cgroup maintains a per-cgroup LRU which has the same structure as the
+global VM's. When a cgroup goes over its limit, we first try
+to reclaim memory from the cgroup so as to make space for the new
+pages that the cgroup has touched. If the reclaim is unsuccessful,
+an OOM routine is invoked to select and kill the bulkiest task in the
+cgroup. (See 10. OOM Control below.)
+
+The reclaim algorithm has not been modified for cgroups, except that
+pages that are selected for reclaiming come from the per-cgroup LRU
+list.
+
+NOTE: Reclaim does not work for the root cgroup, since we cannot set any
+limits on the root cgroup.
+
+Note2: When panic_on_oom is set to "2", the whole system will panic.
+
+When an oom event notifier is registered, events will be delivered.
+(See the oom_control section.)
+
+2.6 Locking
+
+   lock_page_cgroup()/unlock_page_cgroup() should not be called under
+   mapping->tree_lock.
+
+   The other lock order is the following:
+            PG_locked.
+            mm->page_table_lock
+            zone->lru_lock
+            lock_page_cgroup.
+   In many cases, just lock_page_cgroup() is called.
+   The per-zone-per-cgroup LRU (cgroup's private LRU) is guarded only by
+   zone->lru_lock; it has no lock of its own.
+
+2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
+
+With the Kernel memory extension, the Memory Controller is able to limit
+the amount of kernel memory used by the system. Kernel memory is fundamentally
+different from user memory, since it can't be swapped out, which makes it
+possible to DoS the system by consuming too much of this precious resource.
+
+Kernel memory won't be accounted at all until a limit on a group is set. This
+allows existing setups to continue working without disruption. The limit
+cannot be set if the cgroup has children, or if there are already tasks in the
+cgroup. Attempting to set the limit under those conditions will return -EBUSY.
+When use_hierarchy == 1 and a group is accounted, its children will
+automatically be accounted regardless of their limit value.
+
+After a group is limited for the first time, it stays accounted until it
+is removed. The memory limit itself can, of course, be removed by writing
+-1 to memory.kmem.limit_in_bytes. In this case, kmem will be accounted, but
+not limited.
+
+Kernel memory limits are not imposed for the root cgroup. Usage for the root
+cgroup may or may not be accounted. The memory used is accumulated into
+memory.kmem.usage_in_bytes, or in a separate counter when it makes sense
+(currently only for tcp).
+The main "kmem" counter is fed into the main counter, so kmem charges will
+also be visible from the user counter.
+
+Currently no soft limit is implemented for kernel memory. It is future work
+to trigger slab reclaim when those limits are reached.
+
+2.7.1 Current Kernel Memory resources accounted
+
+* stack pages: every process consumes some stack pages. By accounting them
+as kernel memory, we prevent new processes from being created when the kernel
+memory usage is too high.
+
+* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A
+copy of each kmem_cache is created the first time the cache is touched from
+inside the memcg. The creation is done lazily, so some objects can still be
+skipped while the cache is being created. All objects in a slab page should
+belong to the same memcg. This only fails to hold when a task is migrated to
+a different memcg during the page allocation by the cache.
+
+* sockets memory pressure: some socket protocols have memory pressure
+thresholds. The Memory Controller allows them to be controlled individually
+per cgroup, instead of globally.
+
+* tcp memory pressure: sockets memory pressure for the tcp protocol.
+
+2.7.2 Common use cases
+
+Because the "kmem" counter is fed to the main user counter, kernel memory can
+never be limited completely independently of user memory. Say "U" is the user
+limit, and "K" the kernel limit. There are three possible ways limits can be
+set:
+
+    U != 0, K = unlimited:
+    This is the standard memcg limitation mechanism already present before
+    kmem accounting. Kernel memory is completely ignored.
+
+    U != 0, K < U:
+    Kernel memory is a subset of the user memory. This setup is useful in
+    deployments where the total amount of memory per-cgroup is overcommitted.
+    Overcommitting kernel memory limits is definitely not recommended, since
+    the box can still run out of non-reclaimable memory.
+    In this case, the admin could set up K so that the sum of all groups is
+    never greater than the total memory, and freely set U at the cost of QoS.
+    WARNING: In the current implementation, memory reclaim will NOT be
+    triggered for a cgroup when it hits K while staying below U, which makes
+    this setup impractical.
+
+    U != 0, K >= U:
+    Kmem charges will also be fed to the user counter, and reclaim will be
+    triggered for the cgroup for both kinds of memory. This setup gives the
+    admin a unified view of memory, and it is also useful for people who just
+    want to track kernel memory usage.
+
+3. User Interface
+
+3.0. Configuration
+
+a. Enable CONFIG_CGROUPS
+b. Enable CONFIG_MEMCG
+c. Enable CONFIG_MEMCG_SWAP (to use the swap extension)
+d. Enable CONFIG_MEMCG_KMEM (to use the kmem extension)
+
+3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
+# mount -t tmpfs none /sys/fs/cgroup
+# mkdir /sys/fs/cgroup/memory
+# mount -t cgroup none /sys/fs/cgroup/memory -o memory
+
+3.2. Make the new group and move bash into it
+# mkdir /sys/fs/cgroup/memory/0
+# echo $$ > /sys/fs/cgroup/memory/0/tasks
+
+Since we're now in the 0 cgroup, we can alter the memory limit:
+# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+
+NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
+Gibibytes.)
+
+NOTE: We can write "-1" to reset *.limit_in_bytes (unlimited).
+NOTE: We cannot set limits on the root cgroup any more.
+
+# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+4194304
+
+We can check the usage:
+# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
+1216512
+
+A successful write to this file does not guarantee a successful setting of
+this limit to the value written into the file. This can be due to a
+number of factors, such as rounding up to page boundaries or the total
+availability of memory on the system. The user is required to re-read
+this file after a write to guarantee the value committed by the kernel.
+
+# echo 1 > memory.limit_in_bytes
+# cat memory.limit_in_bytes
+4096
+
+The memory.failcnt field gives the number of times that the cgroup limit was
+exceeded.
+
+The memory.stat file gives accounting information. Currently, the number of
+caches, RSS and active/inactive pages are shown.
+
+4. Testing
+
+For testing features and implementation, see memcg_test.txt.
+
+Performance tests are also important. To see the pure memory controller
+overhead, testing on tmpfs will give you good numbers for the small
+overheads involved.
+Example: do a kernel make on tmpfs.
+
+Page-fault scalability is also important.
+When measuring parallel page-fault performance, a multi-process test may be
+better than a multi-thread test because the latter adds noise from shared
+objects/status.
+
+But the above two test extreme situations.
+Trying a usual workload under the memory controller is always helpful.
+
+4.1 Troubleshooting
+
+Sometimes a user might find that the application under a cgroup is
+terminated by the OOM killer. There are several causes for this:
+
+1. The cgroup limit is too low (just too low to do anything useful)
+2. The user is using anonymous memory and swap is turned off or too low
+
+A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
+some of the pages cached in the cgroup (page cache pages).
+
+To know what happens, disabling the OOM killer as per "10. OOM Control"
+(below) and seeing what happens will be helpful.
+
+4.2 Task migration
+
+When a task migrates from one cgroup to another, its charge is not
+carried forward by default. The pages allocated from the original cgroup
+remain charged to it; the charge is dropped when the page is freed or
+reclaimed.
+
+You can move charges of a task along with task migration.
+See 8. "Move charges at task migration"
+
+4.3 Removing a cgroup
+
+A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
+cgroup might have some charge associated with it, even though all
+tasks have migrated away from it. (This is because we charge against pages,
+not against tasks.)
+
+We move the stats to the root (if use_hierarchy==0) or the parent (if
+use_hierarchy==1), and there is no change in the charge except uncharging
+from the child.
+
+Charges recorded in swap information are not updated at removal of a cgroup.
+The recorded information is discarded, and a cgroup which uses the swap
+(swapcache) later will be charged as its new owner.
+
+About use_hierarchy, see Section 6.
+
+5. Misc. interfaces.
+
+5.1 force_empty
+  The memory.force_empty interface is provided to make a cgroup's memory
+  usage empty. When anything is written to this file, e.g.
+
+  # echo 0 > memory.force_empty
+
+  the cgroup is reclaimed and as many pages as possible are reclaimed.
+
+  The typical use case for this interface is before calling rmdir().
+  Because rmdir() moves all pages to the parent, some out-of-use page caches
+  can be moved to the parent. If you want to avoid that, force_empty will be
+  useful.
+
+  Also, note that when memory.kmem.limit_in_bytes is set, the charges due to
+  kernel pages will still be seen. This is not considered a failure and the
+  write will still return success. In this case, it is expected that
+  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
+
+  About use_hierarchy, see Section 6.
+
+5.2 stat file
+
+The memory.stat file includes the following statistics.
+
+# per-memory cgroup local status
+cache           - # of bytes of page cache memory.
+rss             - # of bytes of anonymous and swap cache memory (includes
+                  transparent hugepages).
+rss_huge        - # of bytes of anonymous transparent hugepages.
+mapped_file     - # of bytes of mapped file (includes tmpfs/shmem)
+pgpgin          - # of charging events to the memory cgroup. The charging
+                  event happens each time a page is accounted as either a
+                  mapped anon page (RSS) or a cache page (Page Cache) to the
+                  cgroup.
+pgpgout         - # of uncharging events to the memory cgroup. The uncharging
+                  event happens each time a page is unaccounted from the
+                  cgroup.
+swap            - # of bytes of swap usage
+dirty           - # of bytes that are waiting to get written back to the disk.
+writeback       - # of bytes of file/anon cache that are queued for syncing
+                  to disk.
+inactive_anon   - # of bytes of anonymous and swap cache memory on the
+                  inactive LRU list.
+active_anon     - # of bytes of anonymous and swap cache memory on the
+                  active LRU list.
+inactive_file   - # of bytes of file-backed memory on the inactive LRU list.
+active_file     - # of bytes of file-backed memory on the active LRU list.
+unevictable     - # of bytes of memory that cannot be reclaimed (mlocked etc).
+
+# status considering hierarchy (see memory.use_hierarchy settings)
+
+hierarchical_memory_limit - # of bytes of the memory limit with regard to
+                  the hierarchy under which the memory cgroup is.
+hierarchical_memsw_limit - # of bytes of the memory+swap limit with regard
+                  to the hierarchy under which the memory cgroup is.
+
+total_<counter> - # hierarchical version of <counter>, which in
+                  addition to the cgroup's own value includes the
+                  sum of all hierarchical children's values of
+                  <counter>, i.e. total_cache
+
+# The following additional stats are dependent on CONFIG_DEBUG_VM.
+
+recent_rotated_anon - VM internal parameter. (see mm/vmscan.c)
+recent_rotated_file - VM internal parameter. (see mm/vmscan.c)
+recent_scanned_anon - VM internal parameter. (see mm/vmscan.c)
+recent_scanned_file - VM internal parameter. (see mm/vmscan.c)
+
+Memo:
+	recent_rotated means the recent frequency of LRU rotation.
+	recent_scanned means the recent # of scans of the LRU.
+	These are shown for debugging; please see the code for exact meanings.
+
+Note:
+	Only anonymous and swap cache memory is listed as part of the 'rss'
+	stat. This should not be confused with the true 'resident set size'
+	or the amount of physical memory used by the cgroup.
+	'rss + file_mapped' will give you the resident set size of the cgroup.
+	(Note: file and shmem may be shared among other cgroups. In that case,
+	 file_mapped is accounted only when the memory cgroup is the owner of
+	 the page cache.)
+
+5.3 swappiness
+
+Overrides /proc/sys/vm/swappiness for the particular group. The tunable
+in the root cgroup corresponds to the global swappiness setting.
+
+Please note that, unlike global reclaim, limit reclaim
+enforces that 0 swappiness really prevents any swapping even if
+swap storage is available. This might lead to the memcg OOM killer
+if there are no file pages to reclaim.
+
+5.4 failcnt
+
+A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
+This failcnt (== failure count) shows the number of times that a usage
+counter hit its limit. When a memory cgroup hits a limit, failcnt increases
+and memory under it will be reclaimed.
+
+You can reset failcnt by writing 0 to the failcnt file.
+# echo 0 > .../memory.failcnt
+
+5.5 usage_in_bytes
+
+For efficiency, like other kernel components, memory cgroup uses some
+optimizations to avoid unnecessary cacheline false sharing. usage_in_bytes
+is affected by this and doesn't show the 'exact' value of memory (and swap)
+usage; it's a fuzz value for efficient access. (Of course, when necessary,
+it's synchronized.)
+If you want to know the more exact memory usage, you should use the
+RSS+CACHE(+SWAP) value in memory.stat (see 5.2).
+
+5.6 numa_stat
+
+This is similar to numa_maps but operates on a per-memcg basis. This is
+useful for providing visibility into the numa locality information within
+a memcg since the pages are allowed to be allocated from any physical
+node. One of the use cases is evaluating application performance by
+combining this information with the application's CPU allocation.
+
+Each memcg's numa_stat file includes "total", "file", "anon" and
+"unevictable" per-node page counts, as well as "hierarchical_<counter>"
+entries which sum up all hierarchical children's values in addition to the
+memcg's own value.
+
+The output format of memory.numa_stat is:
+
+total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
+file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
+anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ...
+hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
+
+The "total" count is the sum of file + anon + unevictable.
+
+6. Hierarchy support
+
+The memory controller supports a deep hierarchy and hierarchical accounting.
+The hierarchy is created by creating the appropriate cgroups in the
+cgroup filesystem. Consider, for example, the following cgroup filesystem
+hierarchy:
+
+	       root
+	     /  |   \
+	    /   |    \
+	   a    b     c
+	              | \
+	              |  \
+	              d   e
+
+In the diagram above, with hierarchical accounting enabled, all memory
+usage of e is accounted to its ancestors up to the root (i.e., c and root)
+that have memory.use_hierarchy enabled. If one of the ancestors goes over
+its limit, the reclaim algorithm reclaims from the tasks in the ancestor and
+the children of the ancestor.
+
+6.1 Enabling hierarchical accounting and reclaim
+
+A memory cgroup by default disables the hierarchy feature. Support
+can be enabled by writing 1 to the memory.use_hierarchy file of the root
+cgroup:
+
+# echo 1 > memory.use_hierarchy
+
+The feature can be disabled by:
+
+# echo 0 > memory.use_hierarchy
+
+NOTE1: Enabling/disabling will fail if either the cgroup already has other
+       cgroups created below it, or if the parent cgroup has use_hierarchy
+       enabled.
+
+NOTE2: When panic_on_oom is set to "2", the whole system will panic in
+       case of an OOM event in any cgroup.
+
+7. Soft limits
+
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+
+a. There is no memory contention
+b. They do not exceed their hard limit
+
+When the system detects memory contention or low memory, control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+
+Please note that soft limits are a best-effort feature; they come with
+no guarantees, but they do their best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently, soft-limit-based reclaim is set up such that
+it gets invoked from balance_pgdat (kswapd).
+
+7.1 Interface
+
+Soft limits can be set up by using the following commands (in this example
+we assume a soft limit of 256 MiB):
+
+# echo 256M > memory.soft_limit_in_bytes
+
+If we want to change this to 1G, we can at any time use:
+
+# echo 1G > memory.soft_limit_in_bytes
+
+NOTE1: Soft limits take effect over a long period of time, since they
+       involve reclaiming memory for balancing between memory cgroups.
+NOTE2: It is recommended to always set the soft limit below the hard limit,
+       otherwise the hard limit will take precedence.
+
+8. Move charges at task migration
+
+Users can move charges associated with a task along with task migration,
+that is, uncharge the task's pages from the old cgroup and charge them to
+the new cgroup. This feature is not supported in !CONFIG_MMU environments
+because of the lack of page tables.
+
+8.1 Interface
+
+This feature is disabled by default. It can be enabled (and disabled again)
+by writing to memory.move_charge_at_immigrate of the destination cgroup.
+
+If you want to enable it:
+
+# echo (some positive value) > memory.move_charge_at_immigrate
+
+Note: Each bit of move_charge_at_immigrate has its own meaning about what
+      type of charges should be moved. See 8.2 for details.
+Note: Charges are moved only when you move mm->owner, in other words,
+      a leader of a thread group.
+Note: If we cannot find enough space for the task in the destination cgroup,
+      we try to make space by reclaiming memory. Task migration may fail if
+      we cannot make enough space.
+Note: It can take several seconds if many charges are moved.
+
+And if you want to disable it again:
+
+# echo 0 > memory.move_charge_at_immigrate
+
+8.2 Type of charges which can be moved
+
+Each bit in move_charge_at_immigrate has its own meaning about what type of
+charges should be moved. But in any case, it must be noted that an account
+of a page or a swap can be moved only when it is charged to the task's
+current (old) memory cgroup.
+
+  bit | what type of charges would be moved ?
+ -----+------------------------------------------------------------------------
+   0  | A charge of an anonymous page (or swap of it) used by the target task.
+      | You must enable Swap Extension (see 2.4) to enable move of swap charges.
+ -----+------------------------------------------------------------------------
+   1  | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory)
+      | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
+      | anonymous pages, file pages (and swaps) in the range mmapped by the task
+      | will be moved even if the task hasn't done a page fault yet, i.e. they
+      | might not be the task's "RSS", but another task's "RSS" that maps the
+      | same file. The mapcount of the page is ignored (the page can be moved
+      | even if page_mapcount(page) > 1). You must enable Swap Extension
+      | (see 2.4) to enable move of swap charges.
+
+8.3 TODO
+
+- All moving-charge operations are done under cgroup_mutex. It's not good
+  behavior to hold the mutex for too long, so we may need some trick.
+
+9. Memory thresholds
+
+Memory cgroup implements memory thresholds using the cgroups notification
+API (see cgroups.txt). It allows registering multiple memory and memsw
+thresholds and delivers notifications when a threshold is crossed.
+
+To register a threshold, an application must:
+- create an eventfd using eventfd(2);
+- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
+- write a string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>"
+  to cgroup.event_control.
+
+The application will be notified through eventfd when memory usage crosses
+the threshold in either direction.
+
+This is applicable to both root and non-root cgroups.
+
+10. OOM Control
+
+The memory.oom_control file is for OOM notification and other controls.
+
+Memory cgroup implements an OOM notifier using the cgroup notification
+API (see cgroups.txt). It allows registering multiple OOM notification
+deliveries and sends a notification when an OOM happens.
+
+To register a notifier, an application must:
+ - create an eventfd using eventfd(2)
+ - open the memory.oom_control file
+ - write a string like "<event_fd> <fd of memory.oom_control>" to
+   cgroup.event_control
+
+The application will be notified through eventfd when an OOM happens.
+OOM notification doesn't work for the root cgroup.
+
+You can disable the OOM-killer by writing "1" to the memory.oom_control
+file, as:
+
+	#echo 1 > memory.oom_control
+
+If the OOM-killer is disabled, tasks under the cgroup will hang/sleep
+in the memory cgroup's OOM-waitqueue when they request accountable memory.
+
+To let them run again, you have to relax the memory cgroup's OOM status by
+	* enlarging the limit or reducing usage.
+To reduce usage,
+	* kill some tasks.
+	* move some tasks to another group with charge migration.
+	* remove some files (on tmpfs?)
+
+Then, the stopped tasks will work again.
+
+On reading, the current OOM status is shown.
+	oom_kill_disable 0 or 1
+		(if 1, the oom-killer is disabled)
+	under_oom	 0 or 1
+		(if 1, the memory cgroup is under OOM, and tasks may be
+		 stopped.)
+
+11. Memory Pressure
+
+The pressure level notifications can be used to monitor the memory
+allocation cost; based on the pressure, applications can implement
+different strategies of managing their memory resources. The pressure
+levels are defined as follows:
+
+The "low" level means that the system is reclaiming memory for new
+allocations. Monitoring this reclaiming activity might be useful for
+maintaining the cache level. Upon notification, the program (typically
+an "Activity Manager") might analyze vmstat and act in advance (e.g.
+prematurely shut down unimportant services).
+
+The "medium" level means that the system is experiencing medium memory
+pressure; the system might be swapping, paging out active file caches,
+etc. Upon this event, applications may decide to further analyze
+vmstat/zoneinfo/memcg or internal memory usage statistics and free any
+resources that can be easily reconstructed or re-read from a disk.
+
+The "critical" level means that the system is actively thrashing, it is
+about to run out of memory (OOM), or the in-kernel OOM killer is even on
+its way to trigger. Applications should do whatever they can to help the
+system. It might be too late to consult with vmstat or any other
+statistics, so it's advisable to take immediate action.
+
+The events are propagated upward until the event is handled, i.e. the
+events are not pass-through. Here is what this means: for example, you have
+three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
+and C, and suppose group C experiences some pressure. In this situation,
+only group C will receive the notification, i.e. groups A and B will not
+receive it. This is done to avoid excessive "broadcasting" of messages,
+which disturbs the system and which is especially bad if we are low on
+memory or thrashing. So, organize the cgroups wisely, or propagate the
+events manually (or, ask us to implement the pass-through events,
+explaining why you would need them.)
+
+The file memory.pressure_level is only used to set up an eventfd. To
+register a notification, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.pressure_level;
+- write a string like "<event_fd> <fd of memory.pressure_level> <level>"
+  to cgroup.event_control.
+
+The application will be notified through eventfd when memory pressure is at
+the specified level (or higher). Read/write operations on
+memory.pressure_level are not implemented.
+
+Test:
+
+   Here is a small script example that makes a new cgroup, sets up a
+   memory limit, sets up a notification in the cgroup and then makes the
+   cgroup experience critical pressure:
+
+   # cd /sys/fs/cgroup/memory/
+   # mkdir foo
+   # cd foo
+   # cgroup_event_listener memory.pressure_level low &
+   # echo 8000000 > memory.limit_in_bytes
+   # echo 8000000 > memory.memsw.limit_in_bytes
+   # echo $$ > tasks
+   # dd if=/dev/zero | read x
+
+   (Expect a bunch of notifications, and eventually, the oom-killer will
+   trigger.)
+
+12. TODO
+
+1. Make the per-cgroup scanner reclaim not-shared pages first
+2. Teach the controller to account for shared pages
+3.
Start reclamation in the background when the limit is + not yet hit but the usage is getting closer + +Summary + +Overall, the memory controller has been a stable controller and has been +commented and discussed quite extensively in the community. + +References + +1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +2. Singh, Balbir. Memory Controller (RSS Control), + http://lwn.net/Articles/222762/ +3. Emelianov, Pavel. Resource controllers based on process cgroups + http://lkml.org/lkml/2007/3/6/198 +4. Emelianov, Pavel. RSS controller based on process cgroups (v2) + http://lkml.org/lkml/2007/4/9/78 +5. Emelianov, Pavel. RSS controller based on process cgroups (v3) + http://lkml.org/lkml/2007/5/30/244 +6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ +7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control + subsystem (v3), http://lwn.net/Articles/235534/ +8. Singh, Balbir. RSS controller v2 test results (lmbench), + http://lkml.org/lkml/2007/5/17/232 +9. Singh, Balbir. RSS controller v2 AIM9 results + http://lkml.org/lkml/2007/5/18/1 +10. Singh, Balbir. Memory controller v6 test results, + http://lkml.org/lkml/2007/8/19/36 +11. Singh, Balbir. Memory controller introduction (v6), + http://lkml.org/lkml/2007/8/17/69 +12. Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroup-legacy/net_cls.txt b/Documentation/cgroup-legacy/net_cls.txt new file mode 100644 index 0000000..ec18234 --- /dev/null +++ b/Documentation/cgroup-legacy/net_cls.txt @@ -0,0 +1,39 @@ +Network classifier cgroup +------------------------- + +The Network classifier cgroup provides an interface to +tag network packets with a class identifier (classid). + +The Traffic Controller (tc) can be used to assign +different priorities to packets from different cgroups. +Also, Netfilter (iptables) can use this tag to perform +actions on such packets. + +Creating a net_cls cgroups instance creates a net_cls.classid file. +This net_cls.classid value is initialized to 0. + +You can write hexadecimal values to net_cls.classid; the format for these +values is 0xAAAABBBB; AAAA is the major handle number and BBBB +is the minor handle number. +Reading net_cls.classid yields a decimal result. + +Example: +mkdir /sys/fs/cgroup/net_cls +mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls +mkdir /sys/fs/cgroup/net_cls/0 +echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid + - setting a 10:1 handle. + +cat /sys/fs/cgroup/net_cls/0/net_cls.classid +1048577 + +configuring tc: +tc qdisc add dev eth0 root handle 10: htb + +tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit + - creating traffic class 10:1 + +tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup + +configuring iptables, basic example: +iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/cgroup-legacy/net_prio.txt b/Documentation/cgroup-legacy/net_prio.txt new file mode 100644 index 0000000..a82cbd2 --- /dev/null +++ b/Documentation/cgroup-legacy/net_prio.txt @@ -0,0 +1,55 @@ +Network priority cgroup +------------------------- + +The Network priority cgroup provides an interface to allow an administrator to +dynamically set the priority of network traffic generated by various +applications + +Nominally, an application would set the priority of its traffic via the +SO_PRIORITY socket option. 
This, however, is not always possible because:
+
+1) The application may not have been coded to set this value.
+2) The priority of application traffic is often a site-specific
+   administrative decision rather than an application-defined one.
+
+This cgroup allows an administrator to assign a process to a group which
+defines the priority of egress traffic on a given interface. Network
+priority groups can be created by first mounting the cgroup filesystem.
+
+# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
+
+With the above step, the initial group acting as the parent accounting group
+becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks
+in the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this
+cgroup.
+
+Each net_prio cgroup contains two files that are subsystem specific:
+
+net_prio.prioidx
+This file is read-only, and is simply informative. It contains a unique
+integer value that the kernel uses as an internal representation of this
+cgroup.
+
+net_prio.ifpriomap
+This file contains a map of the priorities assigned to traffic originating
+from processes in this group and egressing the system on various interfaces.
+It contains a list of tuples in the form <ifname priority>. The contents of
+this file can be modified by echoing a string into the file using the same
+tuple format. For example:
+
+echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap
+
+This command would force any traffic originating from processes belonging to
+the iscsi net_prio cgroup and egressing on interface eth0 to have the
+priority of said traffic set to the value 5. The parent accounting group
+also has a writeable 'net_prio.ifpriomap' file that can be used to set a
+system default priority.
+
+Priorities are set immediately prior to queueing a frame to the device
+queueing discipline (qdisc), so priorities will be assigned prior to the
+hardware queue selection being made.
+
+One usage for the net_prio cgroup is with the mqprio qdisc, allowing
+application traffic to be steered to hardware/driver based traffic classes.
+These mappings can then be managed by administrators or other networking
+protocols such as DCBX.
+
+A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroup-legacy/pids.txt b/Documentation/cgroup-legacy/pids.txt
new file mode 100644
index 0000000..1a078b5
--- /dev/null
+++ b/Documentation/cgroup-legacy/pids.txt
@@ -0,0 +1,85 @@
+					Process Number Controller
+					=========================
+
+Abstract
+--------
+
+The process number controller is used to allow a cgroup hierarchy to stop
+any new tasks from being fork()'d or clone()'d after a certain limit is
+reached.
+
+Since it is trivial to hit the task limit without hitting any kmemcg limits
+in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+preventable in the scope of a cgroup hierarchy by allowing resource limiting
+of the number of tasks in a cgroup.
+
+Usage
+-----
+
+In order to use the `pids` controller, set the maximum number of tasks in
+pids.max (this is not available in the root cgroup for obvious reasons). The
+number of processes currently in the cgroup is given by pids.current.
+
+Organisational operations are not blocked by cgroup policies, so it is
+possible to have pids.current > pids.max. This can be done by either setting
+the limit to be smaller than pids.current, or attaching enough processes to
+the cgroup such that pids.current > pids.max. However, it is not possible to
+violate a cgroup policy through fork() or clone(). fork() and clone() will
+return -EAGAIN if the creation of a new process would cause a cgroup policy
+to be violated.
+
+To set a cgroup to have no limit, set pids.max to "max". This is the default
+for all new cgroups (N.B. that PID limits are hierarchical, so the most
+stringent limit in the hierarchy is followed).
+
+pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+a superset of parent/child/pids.current.
+
+Example
+-------
+
+First, we mount the pids controller:
+# mkdir -p /sys/fs/cgroup/pids
+# mount -t cgroup -o pids none /sys/fs/cgroup/pids
+
+Then we create a hierarchy, set limits and attach processes to it:
+# mkdir -p /sys/fs/cgroup/pids/parent/child
+# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
+# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+#
+
+It should be noted that attempts to overcome the set limit (2 in this case)
+will fail:
+
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+# ( /bin/echo "Here's some processes for you." | cat )
+sh: fork: Resource temporarily unavailable
+#
+
+Even if we migrate to a child cgroup (which doesn't have a set limit), we
+will not be able to overcome the most stringent limit in the hierarchy (in
+this case, parent's):
+
+# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+# cat /sys/fs/cgroup/pids/parent/child/pids.current
+2
+# cat /sys/fs/cgroup/pids/parent/child/pids.max
+max
+# ( /bin/echo "Here's some processes for you." | cat )
+sh: fork: Resource temporarily unavailable
+#
+
+We can set a limit that is smaller than pids.current, which will stop any
+new processes from being forked at all (note that the shell itself counts
+towards pids.current):
+
+# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
+# /bin/echo "We can't even spawn a single process now."
+sh: fork: Resource temporarily unavailable
+# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
+# /bin/echo "We can't even spawn a single process now."
+sh: fork: Resource temporarily unavailable
+#
diff --git a/Documentation/cgroup-legacy/unified-hierarchy.txt b/Documentation/cgroup-legacy/unified-hierarchy.txt
new file mode 100644
index 0000000..c1f0e87
--- /dev/null
+++ b/Documentation/cgroup-legacy/unified-hierarchy.txt
@@ -0,0 +1,645 @@
+
+Cgroup unified hierarchy
+
+April, 2014		Tejun Heo
+
+This document describes the changes made by unified hierarchy and
+their rationales. It will eventually be merged into the main cgroup
+documentation.
+
+CONTENTS
+
+1. Background
+2. Basic Operation
+  2-1. Mounting
+  2-2. cgroup.subtree_control
+  2-3. cgroup.controllers
+3. Structural Constraints
+  3-1. Top-down
+  3-2. No internal tasks
+4. Delegation
+  4-1. Model of delegation
+  4-2. Common ancestor rule
+5. Other Changes
+  5-1. [Un]populated Notification
+  5-2. Other Core Changes
+  5-3. Controller File Conventions
+    5-3-1. Format
+    5-3-2. Control Knobs
+  5-4. Per-Controller Changes
+    5-4-1. io
+    5-4-2. cpuset
+    5-4-3. memory
+6. Planned Changes
+  6-1. CAP for resource control
+
+
+1. Background
+
+cgroup allows an arbitrary number of hierarchies and each hierarchy
+can host any number of controllers. While this seems to provide a
+high level of flexibility, it isn't quite useful in practice.
+
+For example, as there is only one instance of each controller, utility
+type controllers such as freezer which can be useful in all
+hierarchies can only be used in one.
The issue is exacerbated by the +fact that controllers can't be moved around once hierarchies are +populated. Another issue is that all controllers bound to a hierarchy +are forced to have exactly the same view of the hierarchy. It isn't +possible to vary the granularity depending on the specific controller. + +In practice, these issues heavily limit which controllers can be put +on the same hierarchy and most configurations resort to putting each +controller on its own hierarchy. Only closely related ones, such as +the cpu and cpuacct controllers, make sense to put on the same +hierarchy. This often means that userland ends up managing multiple +similar hierarchies repeating the same steps on each hierarchy +whenever a hierarchy management operation is necessary. + +Unfortunately, support for multiple hierarchies comes at a steep cost. +Internal implementation in cgroup core proper is dazzlingly +complicated but more importantly the support for multiple hierarchies +restricts how cgroup is used in general and what controllers can do. + +There's no limit on how many hierarchies there may be, which means +that a task's cgroup membership can't be described in finite length. +The key may contain any varying number of entries and is unlimited in +length, which makes it highly awkward to handle and leads to addition +of controllers which exist only to identify membership, which in turn +exacerbates the original problem. + +Also, as a controller can't have any expectation regarding what shape +of hierarchies other controllers would be on, each controller has to +assume that all other controllers are operating on completely +orthogonal hierarchies. This makes it impossible, or at least very +cumbersome, for controllers to cooperate with each other. + +In most use cases, putting controllers on hierarchies which are +completely orthogonal to each other isn't necessary. What usually is +called for is the ability to have differing levels of granularity +depending on the specific controller. In other words, hierarchy may +be collapsed from leaf towards root when viewed from specific +controllers. For example, a given configuration might not care about +how memory is distributed beyond a certain level while still wanting +to control how CPU cycles are distributed. + +Unified hierarchy is the next version of cgroup interface. It aims to +address the aforementioned issues by having more structure while +retaining enough flexibility for most use cases. Various other +general and controller-specific interface issues are also addressed in +the process. + + +2. Basic Operation + +2-1. Mounting + +Unified hierarchy can be mounted with the following mount command. + + mount -t cgroup2 none $MOUNT_POINT + +All controllers which support the unified hierarchy and are not bound +to other hierarchies are automatically bound to unified hierarchy and +show up at the root of it. Controllers which are enabled only in the +root of unified hierarchy can be bound to other hierarchies. This +allows mixing unified hierarchy with the traditional multiple +hierarchies in a fully backward compatible way. + +A controller can be moved across hierarchies only after the controller +is no longer referenced in its current hierarchy. Because per-cgroup +controller states are destroyed asynchronously and controllers may +have lingering references, a controller may not show up immediately on +the unified hierarchy after the final umount of the previous +hierarchy. 
Similarly, a controller should be fully disabled to be +moved out of the unified hierarchy and it may take some time for the +disabled controller to become available for other hierarchies; +furthermore, due to dependencies among controllers, other controllers +may need to be disabled too. + +While useful for development and manual configurations, dynamically +moving controllers between the unified and other hierarchies is +strongly discouraged for production use. It is recommended to decide +the hierarchies and controller associations before starting using the +controllers. + + +2-2. cgroup.subtree_control + +All cgroups on unified hierarchy have a "cgroup.subtree_control" file +which governs which controllers are enabled on the children of the +cgroup. Let's assume a hierarchy like the following. + + root - A - B - C + \ D + +root's "cgroup.subtree_control" file determines which controllers are +enabled on A. A's on B. B's on C and D. This coincides with the +fact that controllers on the immediate sub-level are used to +distribute the resources of the parent. In fact, it's natural to +assume that resource control knobs of a child belong to its parent. +Enabling a controller in a "cgroup.subtree_control" file declares that +distribution of the respective resources of the cgroup will be +controlled. Note that this means that controller enable states are +shared among siblings. + +When read, the file contains a space-separated list of currently +enabled controllers. A write to the file should contain a +space-separated list of controllers with '+' or '-' prefixed (without +the quotes). Controllers prefixed with '+' are enabled and '-' +disabled. If a controller is listed multiple times, the last entry +wins. The specific operations are executed atomically - either all +succeed or fail. + + +2-3. cgroup.controllers + +Read-only "cgroup.controllers" file contains a space-separated list of +controllers which can be enabled in the cgroup's +"cgroup.subtree_control" file. + +In the root cgroup, this lists controllers which are not bound to +other hierarchies and the content changes as controllers are bound to +and unbound from other hierarchies. + +In non-root cgroups, the content of this file equals that of the +parent's "cgroup.subtree_control" file as only controllers enabled +from the parent can be used in its children. + + +3. Structural Constraints + +3-1. Top-down + +As it doesn't make sense to nest control of an uncontrolled resource, +all non-root "cgroup.subtree_control" files can only contain +controllers which are enabled in the parent's "cgroup.subtree_control" +file. A controller can be enabled only if the parent has the +controller enabled and a controller can't be disabled if one or more +children have it enabled. + + +3-2. No internal tasks + +One long-standing issue that cgroup faces is the competition between +tasks belonging to the parent cgroup and its children cgroups. This +is inherently nasty as two different types of entities compete and +there is no agreed-upon obvious way to handle it. Different +controllers are doing different things. + +The cpu controller considers tasks and cgroups as equivalents and maps +nice levels to cgroup weights. This works for some cases but falls +flat when children should be allocated specific ratios of CPU cycles +and the number of internal tasks fluctuates - the ratios constantly +change as the number of competing entities fluctuates. There also are +other issues. 
The mapping from nice level to weight isn't obvious or +universal, and there are various other knobs which simply aren't +available for tasks. + +The io controller implicitly creates a hidden leaf node for each +cgroup to host the tasks. The hidden leaf has its own copies of all +the knobs with "leaf_" prefixed. While this allows equivalent control +over internal tasks, it's with serious drawbacks. It always adds an +extra layer of nesting which may not be necessary, makes the interface +messy and significantly complicates the implementation. + +The memory controller currently doesn't have a way to control what +happens between internal tasks and child cgroups and the behavior is +not clearly defined. There have been attempts to add ad-hoc behaviors +and knobs to tailor the behavior to specific workloads. Continuing +this direction will lead to problems which will be extremely difficult +to resolve in the long term. + +Multiple controllers struggle with internal tasks and came up with +different ways to deal with it; unfortunately, all the approaches in +use now are severely flawed and, furthermore, the widely different +behaviors make cgroup as whole highly inconsistent. + +It is clear that this is something which needs to be addressed from +cgroup core proper in a uniform way so that controllers don't need to +worry about it and cgroup as a whole shows a consistent and logical +behavior. To achieve that, unified hierarchy enforces the following +structural constraint: + + Except for the root, only cgroups which don't contain any task may + have controllers enabled in their "cgroup.subtree_control" files. + +Combined with other properties, this guarantees that, when a +controller is looking at the part of the hierarchy which has it +enabled, tasks are always only on the leaves. This rules out +situations where child cgroups compete against internal tasks of the +parent. + +There are two things to note. Firstly, the root cgroup is exempt from +the restriction. Root contains tasks and anonymous resource +consumption which can't be associated with any other cgroup and +requires special treatment from most controllers. How resource +consumption in the root cgroup is governed is up to each controller. + +Secondly, the restriction doesn't take effect if there is no enabled +controller in the cgroup's "cgroup.subtree_control" file. This is +important as otherwise it wouldn't be possible to create children of a +populated cgroup. To control resource distribution of a cgroup, the +cgroup must create children and transfer all its tasks to the children +before enabling controllers in its "cgroup.subtree_control" file. + + +4. Delegation + +4-1. Model of delegation + +A cgroup can be delegated to a less privileged user by granting write +access of the directory and its "cgroup.procs" file to the user. Note +that the resource control knobs in a given directory concern the +resources of the parent and thus must not be delegated along with the +directory. + +Once delegated, the user can build sub-hierarchy under the directory, +organize processes as it sees fit and further distribute the resources +it got from the parent. The limits and other settings of all resource +controllers are hierarchical and regardless of what happens in the +delegated sub-hierarchy, nothing can escape the resource restrictions +imposed by the parent. 
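+
+For example, delegating a cgroup to a (hypothetical) unprivileged user u0
+could look like the following, with $MOUNT_POINT as in section 2-1:
+
+  # mkdir $MOUNT_POINT/u0
+  # chown u0 $MOUNT_POINT/u0 $MOUNT_POINT/u0/cgroup.procs
+
+u0 can now create sub-cgroups under u0/ and move its own processes among
+them, while the resource control knobs in $MOUNT_POINT/u0 itself remain
+under the parent's control.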
+ +Currently, cgroup doesn't impose any restrictions on the number of +cgroups in or nesting depth of a delegated sub-hierarchy; however, +this may in the future be limited explicitly. + + +4-2. Common ancestor rule + +On the unified hierarchy, to write to a "cgroup.procs" file, in +addition to the usual write permission to the file and uid match, the +writer must also have write access to the "cgroup.procs" file of the +common ancestor of the source and destination cgroups. This prevents +delegatees from smuggling processes across disjoint sub-hierarchies. + +Let's say cgroups C0 and C1 have been delegated to user U0 who created +C00, C01 under C0 and C10 under C1 as follows. + + ~~~~~~~~~~~~~ - C0 - C00 + ~ cgroup ~ \ C01 + ~ hierarchy ~ + ~~~~~~~~~~~~~ - C1 - C10 + +C0 and C1 are separate entities in terms of resource distribution +regardless of their relative positions in the hierarchy. The +resources the processes under C0 are entitled to are controlled by +C0's ancestors and may be completely different from C1. It's clear +that the intention of delegating C0 to U0 is allowing U0 to organize +the processes under C0 and further control the distribution of C0's +resources. + +On traditional hierarchies, if a task has write access to "tasks" or +"cgroup.procs" file of a cgroup and its uid agrees with the target, it +can move the target to the cgroup. In the above example, U0 will not +only be able to move processes in each sub-hierarchy but also across +the two sub-hierarchies, effectively allowing it to violate the +organizational and resource restrictions implied by the hierarchical +structure above C0 and C1. + +On the unified hierarchy, let's say U0 wants to write the pid of a +process which has a matching uid and is currently in C10 into +"C00/cgroup.procs". U0 obviously has write access to the file and +migration permission on the process; however, the common ancestor of +the source cgroup C10 and the destination cgroup C00 is above the +points of delegation and U0 would not have write access to its +"cgroup.procs" and thus be denied with -EACCES. + + +5. Other Changes + +5-1. [Un]populated Notification + +cgroup users often need a way to determine when a cgroup's +subhierarchy becomes empty so that it can be cleaned up. cgroup +currently provides release_agent for it; unfortunately, this mechanism +is riddled with issues. + +- It delivers events by forking and execing a userland binary + specified as the release_agent. This is a long deprecated method of + notification delivery. It's extremely heavy, slow and cumbersome to + integrate with larger infrastructure. + +- There is single monitoring point at the root. There's no way to + delegate management of a subtree. + +- The event isn't recursive. It triggers when a cgroup doesn't have + any tasks or child cgroups. Events for internal nodes trigger only + after all children are removed. This again makes it impossible to + delegate management of a subtree. + +- Events are filtered from the kernel side. A "notify_on_release" + file is used to subscribe to or suppress release events. This is + unnecessarily complicated and probably done this way because event + delivery itself was expensive. + +Unified hierarchy implements "populated" field in "cgroup.events" +interface file which can be used to monitor whether the cgroup's +subhierarchy has tasks in it or not. Its value is 0 if there is no +task in the cgroup and its descendants; otherwise, 1. poll and +[id]notify events are triggered when the value changes. 
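+
+For example (a sketch; $CG stands for any cgroup directory on the unified
+hierarchy):
+
+  # cat $CG/cgroup.events
+  populated 1
+
+A monitoring process can poll or [id]notify-watch this file and react when
+"populated" flips between 0 and 1.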
+ +This is significantly lighter and simpler and trivially allows +delegating management of a subhierarchy - a subhierarchy monitor can +block further propagation simply by putting itself or another process +in the subhierarchy and monitoring the events it's interested in from +there without interfering with monitoring higher in the tree. + +In unified hierarchy, the release_agent mechanism is no longer +supported and the interface files "release_agent" and +"notify_on_release" do not exist. + + +5-2. Other Core Changes + +- None of the mount options is allowed. + +- remount is disallowed. + +- rename(2) is disallowed. + +- The "tasks" file is removed. Everything should be at process + granularity. Use the "cgroup.procs" file instead. + +- The "cgroup.procs" file is not sorted. pids will be unique unless + they got recycled in between reads. + +- The "cgroup.clone_children" file is removed. + +- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged + to before exiting. If the cgroup is removed before the zombie is + reaped, " (deleted)" is appended to the path. + + +5-3. Controller File Conventions + +5-3-1. Format + +In general, all controller files should be in one of the following +formats whenever possible. + +- Values only files + + VAL0 VAL1...\n + +- Flat keyed files + + KEY0 VAL0\n + KEY1 VAL1\n + ... + +- Nested keyed files + + KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01... + KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11... + ... + +For a writeable file, the format for writing should generally match +reading; however, controllers may allow omitting later fields or +implement restricted shortcuts for the most common use cases. + +For both flat and nested keyed files, only the values for a single key +can be written at a time. For nested keyed files, the sub key pairs +may be specified in any order and not all pairs have to be specified. + + +5-3-2. Control Knobs + +- Settings for a single feature should generally be implemented in a + single file. + +- In general, the root cgroup should be exempt from resource control + and thus shouldn't have resource control knobs. + +- If a controller implements ratio-based resource distribution, the + control knob should be named "weight" and have the range [1, 10000] + with 100 as the default value. The values are chosen to allow + enough and symmetric bias in both directions while keeping it + intuitive (the default is 100%). + +- If a controller implements an absolute resource guarantee and/or + limit, the control knobs should be named "min" and "max" + respectively. If a controller implements a best-effort resource + guarantee and/or limit, the control knobs should be named "low" and + "high" respectively. + + In the above four control files, the special token "max" should be + used to represent upward infinity for both reading and writing. + +- If a setting has a configurable default value and specific overrides, + the default settings should be keyed with "default" and appear as + the first entry in the file. Specific entries can use "default" as + their value to indicate inheritance of the default value. + +- For events which are not very high frequency, an interface file + "events" should be created which lists event key value pairs. + Whenever a notifiable event happens, a file modified event should be + generated on the file. + + +5-4. Per-Controller Changes + +5-4-1. io + +- blkio is renamed to io. The interface is overhauled anyway. 
The + new name is more in line with the other two major controllers, cpu + and memory, and better suited given that it may be used for cgroup + writeback without involving the block layer. + +- Everything including stat is always hierarchical, making separate + recursive stat files pointless, and, as no internal node can have + tasks, leaf weights are meaningless. The operation model is + simplified and the interface is overhauled accordingly. + + io.stat + + The stat file. The reported stats are from the point where + bio's are issued to request_queue. The stats are counted + independently of which policies are enabled. Each line in the + file follows the following format. More fields may later be + added at the end. + + $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS + + io.weight + + The weight setting, currently only available and effective if + cfq-iosched is in use for the target device. The weight is + between 1 and 10000 and defaults to 100. The first line + always contains the default weight in the following format, + which is used when a per-device setting is missing. + + default $WEIGHT + + Subsequent lines list per-device weights in the following + format. + + $MAJ:$MIN $WEIGHT + + Writing "$WEIGHT" or "default $WEIGHT" changes the default + setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight + while "$MAJ:$MIN default" clears it. + + This file is available only on non-root cgroups. + + io.max + + The maximum bandwidth and/or iops setting, only available if + blk-throttle is enabled. The file is of the following format. + + $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS + + ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are + read/write IOs per second. "max" indicates no limit. Writing + to the file follows the same format but the individual + settings may be omitted or specified in any order. + + This file is available only on non-root cgroups. + + +5-4-2. cpuset + +- Tasks are kept in empty cpusets after hotplug and take on the masks + of the nearest non-empty ancestor, instead of being moved to it. + +- A task can be moved into an empty cpuset, and again it takes on the + masks of the nearest non-empty ancestor. + + +5-4-3. memory + +- use_hierarchy is on by default and the cgroup file for the flag is + not created. + +- The original lower boundary, the soft limit, is defined as a limit + that is unset by default. As a result, the set of cgroups that + global reclaim prefers is opt-in, rather than opt-out. The costs + for optimizing these mostly negative lookups are so high that the + implementation, despite its enormous size, does not even provide the + basic desirable behavior. First off, the soft limit has no + hierarchical meaning. All configured groups are organized in a + global rbtree and treated like equal peers, regardless of where they + are located in the hierarchy. This makes subtree delegation + impossible. Second, the soft limit reclaim pass is so aggressive + that it not only introduces high allocation latencies into the + system, but also impacts system performance due to overreclaim, to + the point where the feature becomes self-defeating. + + The memory.low boundary on the other hand is a top-down allocated + reserve. A cgroup enjoys reclaim protection when it and all its + ancestors are below their low boundaries, which makes delegation of + subtrees possible. Secondly, new cgroups have no reserve by + default and in the common case most cgroups are eligible for the + preferred reclaim pass. 
This allows the new low boundary to be + efficiently implemented with just a minor addition to the generic + reclaim code, without the need for out-of-band data structures and + reclaim passes. Because the generic reclaim code considers all + cgroups except for the ones running low in the preferred first + reclaim pass, overreclaim of individual groups is eliminated as + well, resulting in much better overall workload performance. + +- The original high boundary, the hard limit, is defined as a strict + limit that cannot budge, even if the OOM killer has to be called. + But this generally goes against the goal of making the most out of + the available memory. The memory consumption of workloads varies + during runtime, and that requires users to overcommit. But doing + that with a strict upper limit requires either a fairly accurate + prediction of the working set size or adding slack to the limit. + Since working set size estimation is hard and error-prone, and + getting it wrong results in OOM kills, most users tend to err on the + side of a looser limit and end up wasting precious resources. + + The memory.high boundary on the other hand can be set much more + conservatively. When hit, it throttles allocations by forcing them + into direct reclaim to work off the excess, but it never invokes the + OOM killer. As a result, a high boundary that is chosen too + aggressively will not terminate the processes, but instead it will + lead to gradual performance degradation. The user can monitor this + and make corrections until the minimal memory footprint that still + gives acceptable performance is found. + + In extreme cases, with many concurrent allocations and a complete + breakdown of reclaim progress within the group, the high boundary + can be exceeded. But even then it's mostly better to satisfy the + allocation from the slack available in other groups or the rest of + the system than to kill the group. Otherwise, memory.max is there + to limit this type of spillover and ultimately contain buggy or even + malicious applications. + +- The original control file names are unwieldy and inconsistent in + many different ways. For example, the upper boundary hit count is + exported in the memory.failcnt file, but an OOM event count has to + be manually counted by listening to memory.oom_control events, and + lower boundary / soft limit events have to be counted by first + setting a threshold for that value and then counting those events. + Also, usage and limit files encode their units in the filename. + That makes the filenames very long, even though this is not + information that a user needs to be reminded of every time they type + out those names. + + To address these naming issues, as well as to signal clearly that + the new interface carries a new configuration model, the naming + conventions in it necessarily differ from the old interface. + +- The original limit files indicate the state of an unset limit with a + Very High Number, and a configured limit can be unset by echoing -1 + into those files. But that very high number is implementation and + architecture dependent and not very descriptive. And while -1 can + be understood as an underflow into the highest possible value, -2 or + -10M etc. do not work, so it's not consistent. + + memory.low, memory.high, and memory.max will use the string "max" to + indicate and set the highest possible value. + +6. Planned Changes + +6-1. 
CAP for resource control + +Unified hierarchy will require one of the capabilities(7), which is +yet to be decided, for all resource control related knobs. Process +organization operations - creation of sub-cgroups and migration of +processes in sub-hierarchies - may be delegated by changing the +ownership and/or permissions on the cgroup directory and +"cgroup.procs" interface file; however, all operations which affect +resource control - writes to a "cgroup.subtree_control" file or any +controller-specific knobs - will require an explicit CAP privilege. + +This, in part, is to prevent the cgroup interface from being +inadvertently promoted to a programmable API used by non-privileged +binaries. cgroup exposes various aspects of the system in ways which +aren't properly abstracted for direct consumption by regular programs. +This is an administration interface much closer to sysctl knobs than +system calls. Even the basic access model, being filesystem path +based, isn't suitable for direct consumption. There's no way to +access "my cgroup" in a race-free way or make multiple operations +atomic against migration to another cgroup. + +Another aspect is that, for better or for worse, the cgroup interface +goes through far less scrutiny than regular interfaces for +unprivileged userland. The upside is that cgroup is able to expose +useful features which may not be suitable for general consumption in a +reasonable time frame. It provides a relatively short path between +internal details and userland-visible interface. Of course, this +shortcut comes with high risk. We go through what we go through for +general kernel APIs for good reasons. It may end up leaking internal +details in a way which can exert significant pain by locking the +kernel into a contract that can't be maintained in a reasonable +manner. + +Also, due to its specific nature, cgroup and its controllers don't +tend to attract attention from a wide scope of developers. cgroup's +short history is already fraught with severely mis-designed +interfaces, unnecessary commitments to and exposing of internal +details, broken and dangerous implementations of various features. + +Keeping cgroup as an administration interface is both advantageous for +its role and imperative given its nature. Some of the cgroup features +may make sense for unprivileged access. If deemed justified, those +must be further abstracted and implemented as a different interface, +be it a system call or process-private filesystem, and survive through +the scrutiny that any interface for general consumption is required to +go through. + +Requiring CAP is not a complete solution but should serve as a +significant deterrent against spraying cgroup usages in non-privileged +programs. diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX deleted file mode 100644 index 3f5a40f..0000000 --- a/Documentation/cgroups/00-INDEX +++ /dev/null @@ -1,30 +0,0 @@ -00-INDEX - - this file -blkio-controller.txt - - Description for Block IO Controller, implementation and usage details. -cgroups.txt - - Control Groups definition, implementation details, examples and API. -cpuacct.txt - - CPU Accounting Controller; account CPU usage for groups of tasks. -cpusets.txt - - documents the cpusets feature; assign CPUs and Mem to a set of tasks. -devices.txt - - Device Whitelist Controller; description, interface and security. -freezer-subsystem.txt - - checkpointing; rationale to not use signals, interface. 
-hugetlb.txt - - HugeTLB Controller implementation and usage details. -memcg_test.txt - - Memory Resource Controller; implementation details. -memory.txt - - Memory Resource Controller; design, accounting, interface, testing. -net_cls.txt - - Network classifier cgroups details and usages. -net_prio.txt - - Network priority cgroups details and usages. -pids.txt - - Process number cgroups details and usages. -resource_counter.txt - - Resource Counter API. -unified-hierarchy.txt - - Description of the new/next cgroup interface. diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt deleted file mode 100644 index 52fa9f3..0000000 --- a/Documentation/cgroups/blkio-controller.txt +++ /dev/null @@ -1,455 +0,0 @@ - Block IO Controller - =================== -Overview -======== -cgroup subsys "blkio" implements the block io controller. There seems to be -a need for various kinds of IO control policies (like proportional BW, max BW) -both at leaf nodes as well as at intermediate nodes in a storage hierarchy. -Plan is to use the same cgroup based management interface for blkio controller -and based on user options switch IO policies in the background. - -Currently two IO control policies are implemented. First one is proportional -weight time based division of disk policy. It is implemented in CFQ. Hence -this policy takes effect only on leaf nodes when CFQ is being used. The second -one is throttling policy which can be used to specify upper IO rate limits -on devices. This policy is implemented in generic block layer and can be -used on leaf nodes as well as higher level logical devices like device mapper. - -HOWTO -===== -Proportional Weight division of bandwidth ----------------------------------------- -You can do a very simple testing of running two dd threads in two different -cgroups. Here is what you can do. - -- Enable Block IO controller - CONFIG_BLK_CGROUP=y - -- Enable group scheduling in CFQ - CONFIG_CFQ_GROUP_IOSCHED=y - -- Compile and boot into kernel and mount IO controller (blkio); see - cgroups.txt, Why are cgroups needed?. - - mount -t tmpfs cgroup_root /sys/fs/cgroup - mkdir /sys/fs/cgroup/blkio - mount -t cgroup -o blkio none /sys/fs/cgroup/blkio - -- Create two cgroups - mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2 - -- Set weights of group test1 and test2 - echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight - echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight - -- Create two same size files (say 512MB each) on same disk (file1, file2) and - launch two dd threads in different cgroup to read those files. - - sync - echo 3 > /proc/sys/vm/drop_caches - - dd if=/mnt/sdb/zerofile1 of=/dev/null & - echo $! > /sys/fs/cgroup/blkio/test1/tasks - cat /sys/fs/cgroup/blkio/test1/tasks - - dd if=/mnt/sdb/zerofile2 of=/dev/null & - echo $! > /sys/fs/cgroup/blkio/test2/tasks - cat /sys/fs/cgroup/blkio/test2/tasks - -- At macro level, first dd should finish first. To get more precise data, keep - on looking at (with the help of script), at blkio.disk_time and - blkio.disk_sectors files of both test1 and test2 groups. This will tell how - much disk time (in milliseconds), each group got and how many sectors each - group dispatched to the disk. We provide fairness in terms of disk time, so - ideally io.disk_time of cgroups should be in proportion to the weight. 
- -Throttling/Upper Limit policy ------------------------------ -- Enable Block IO controller - CONFIG_BLK_CGROUP=y - -- Enable throttling in block layer - CONFIG_BLK_DEV_THROTTLING=y - -- Mount blkio controller (see cgroups.txt, Why are cgroups needed?) - mount -t cgroup -o blkio none /sys/fs/cgroup/blkio - -- Specify a bandwidth rate on particular device for root group. The format - for policy is "<major>:<minor> <byps>". - - echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device - - Above will put a limit of 1MB/second on reads happening for root group - on device having major/minor number 8:16. - -- Run dd to read a file and see if rate is throttled to 1MB/s or not. - - # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 - # iflag=direct - 1024+0 records in - 1024+0 records out - 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s - - Limits for writes can be put using blkio.throttle.write_bps_device file. - -Hierarchical Cgroups -==================== - -Both CFQ and throttling implement hierarchy support; however, -throttling's hierarchy support is enabled iff "sane_behavior" is -enabled from cgroup side, which currently is a development option and -not publicly available. - -If somebody created a hierarchy like as follows. - - root - / \ - test1 test2 - | - test3 - -CFQ by default and throttling with "sane_behavior" will handle the -hierarchy correctly. For details on CFQ hierarchy support, refer to -Documentation/block/cfq-iosched.txt. For throttling, all limits apply -to the whole subtree while all statistics are local to the IOs -directly generated by tasks in that cgroup. - -Throttling without "sane_behavior" enabled from cgroup side will -practically treat all groups at same level as if it looks like the -following. - - pivot - / / \ \ - root test1 test2 test3 - -Various user visible config options =================================== -CONFIG_BLK_CGROUP - - Block IO controller. - -CONFIG_DEBUG_BLK_CGROUP - - Debug help. Right now some additional stats file show up in cgroup - if this option is enabled. - -CONFIG_CFQ_GROUP_IOSCHED - - Enables group scheduling in CFQ. Currently only 1 level of group - creation is allowed. - -CONFIG_BLK_DEV_THROTTLING - - Enable block device throttling support in block layer. - -Details of cgroup files ======================= -Proportional weight policy files -------------------------------- -- blkio.weight - - Specifies per cgroup weight. This is default weight of the group - on all the devices until and unless overridden by per device rule. - (See blkio.weight_device). - Currently allowed range of weights is from 10 to 1000. - -- blkio.weight_device - - One can specify per cgroup per device rules using this interface. - These rules override the default value of group weight as specified - by blkio.weight. - - Following is the format. - - # echo dev_maj:dev_minor weight > blkio.weight_device - Configure weight=300 on /dev/sdb (8:16) in this cgroup - # echo 8:16 300 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - - Configure weight=500 on /dev/sda (8:0) in this cgroup - # echo 8:0 500 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:0 500 - 8:16 300 - - Remove specific weight for /dev/sda in this cgroup - # echo 8:0 0 > blkio.weight_device - # cat blkio.weight_device - dev weight - 8:16 300 - -- blkio.leaf_weight[_device] - - Equivalents of blkio.weight[_device] for the purpose of - deciding how much weight tasks in the given cgroup have while - competing with the cgroup's child cgroups. 
For details, - please refer to Documentation/block/cfq-iosched.txt. - -- blkio.time - - disk time allocated to cgroup per device in milliseconds. First - two fields specify the major and minor number of the device and - third field specifies the disk time allocated to group in - milliseconds. - -- blkio.sectors - - number of sectors transferred to/from disk by the group. First - two fields specify the major and minor number of the device and - third field specifies the number of sectors transferred by the - group to/from the device. - -- blkio.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -- blkio.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.io_service_time - - Total amount of time between request dispatch and request completion - for the IOs done by this cgroup. This is in nanoseconds to make it - meaningful for flash devices too. For devices with queue depth of 1, - this time represents the actual service time. When queue_depth > 1, - that is no longer true as requests may be served out of order. This - may cause the service time for a given IO to include the service time - of multiple IOs when served out of order which may result in total - io_service_time > actual time elapsed. This time is further divided by - the type of operation - read or write, sync or async. First two fields - specify the major and minor number of the device, third field - specifies the operation type and the fourth field specifies the - io_service_time in ns. - -- blkio.io_wait_time - - Total amount of time the IOs for this cgroup spent waiting in the - scheduler queues for service. This can be greater than the total time - elapsed since it is cumulative io_wait_time for all IOs. It is not a - measure of total time the cgroup spent waiting but rather a measure of - the wait_time for its individual IOs. For devices with queue_depth > 1 - this metric does not include the time spent waiting for service once - the IO is dispatched to the device but till it actually gets serviced - (there might be a time lag here due to re-ordering of requests by the - device). This is in nanoseconds to make it meaningful for flash - devices too. This time is further divided by the type of operation - - read or write, sync or async. First two fields specify the major and - minor number of the device, third field specifies the operation type - and the fourth field specifies the io_wait_time in ns. - -- blkio.io_merged - - Total number of bios/requests merged into requests belonging to this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.io_queued - - Total number of requests queued up at any given instant for this - cgroup. This is further divided by the type of operation - read or - write, sync or async. - -- blkio.avg_queue_size - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. - The average queue size for this cgroup over the entire time of this - cgroup's existence. 
Queue size samples are taken each time one of the - queues of this cgroup gets a timeslice. - -- blkio.group_wait_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. - This is the amount of time the cgroup had to wait since it became busy - (i.e., went from 0 to 1 request queued) to get a timeslice for one of - its queues. This is different from the io_wait_time which is the - cumulative total of the amount of time spent by each IO in that cgroup - waiting in the scheduler queue. This is in nanoseconds. If this is - read when the cgroup is in a waiting (for timeslice) state, the stat - will only report the group_wait_time accumulated till the last time it - got a timeslice and will not include the current delta. - -- blkio.empty_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. - This is the amount of time a cgroup spends without any pending - requests when not being served, i.e., it does not include any time - spent idling for one of the queues of the cgroup. This is in - nanoseconds. If this is read when the cgroup is in an empty state, - the stat will only report the empty_time accumulated till the last - time it had a pending request and will not include the current delta. - -- blkio.idle_time - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. - This is the amount of time spent by the IO scheduler idling for a - given cgroup in anticipation of a better request than the existing ones - from other queues/cgroups. This is in nanoseconds. If this is read - when the cgroup is in an idling state, the stat will only report the - idle_time accumulated till the last idle period and will not include - the current delta. - -- blkio.dequeue - - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This - gives the statistics about how many times a group was dequeued - from the service tree of the device. First two fields specify the major - and minor number of the device and third field specifies the number - of times a group was dequeued from a particular device. - -- blkio.*_recursive - - Recursive version of various stats. These files show the - same information as their non-recursive counterparts but - include stats from all the descendant cgroups. - -Throttling/Upper limit policy files ------------------------------------ -- blkio.throttle.read_bps_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format. - - echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device - -- blkio.throttle.write_bps_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in bytes per second. Rules are per device. Following is - the format. - - echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device - -- blkio.throttle.read_iops_device - - Specifies upper limit on READ rate from the device. IO rate is - specified in IO per second. Rules are per device. Following is - the format. - - echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device - -- blkio.throttle.write_iops_device - - Specifies upper limit on WRITE rate to the device. IO rate is - specified in IO per second. Rules are per device. Following is - the format. - - echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device - -Note: If both BW and IOPS rules are specified for a device, then IO is - subjected to both the constraints. - -- blkio.throttle.io_serviced - - Number of IOs (bio) issued to the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. 
First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of IOs. - -- blkio.throttle.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These - are further divided by the type of operation - read or write, sync - or async. First two fields specify the major and minor number of the - device, third field specifies the operation type and the fourth field - specifies the number of bytes. - -Common files among various policies ----------------------------------- -- blkio.reset_stats - - Writing an int to this file will result in resetting all the stats - for that cgroup. - -CFQ sysfs tunable ================= -/sys/block/<dev>/queue/iosched/slice_idle ------------------------------------------- -On faster hardware CFQ can be slow, especially with sequential workload. -This happens because CFQ idles on a single queue and single queue might not -drive deeper request queue depths to keep the storage busy. In such scenarios -one can try setting slice_idle=0 and that would switch CFQ to IOPS -(IO operations per second) mode on NCQ supporting hardware. - -That means CFQ will not idle between cfq queues of a cfq group and hence be -able to drive higher queue depth and achieve better throughput. That also -means that cfq provides fairness among groups in terms of IOPS and not in -terms of disk time. - -/sys/block/<dev>/queue/iosched/group_idle ------------------------------------------- -If one disables idling on individual cfq queues and cfq service trees by -setting slice_idle=0, group_idle kicks in. That means CFQ will still idle -on the group in an attempt to provide fairness among groups. - -By default group_idle is the same as slice_idle and does not do anything if -slice_idle is enabled. - -You can experience an overall throughput drop if you have created multiple -groups and put applications in those groups which are not driving enough -IO to keep the disk busy. In that case set group_idle=0, and CFQ will not idle -on individual groups and throughput should improve. - -Writeback ========= - -Page cache is dirtied through buffered writes and shared mmaps and -written asynchronously to the backing filesystem by the writeback -mechanism. Writeback sits between the memory and IO domains and -regulates the proportion of dirty memory by balancing dirtying and -write IOs. - -On traditional cgroup hierarchies, relationships between different -controllers cannot be established, making it impossible for writeback -to operate accounting for cgroup resource restrictions and all -writeback IOs are attributed to the root cgroup. - -If both the blkio and memory controllers are used on the v2 hierarchy -and the filesystem supports cgroup writeback, writeback operations -correctly follow the resource restrictions imposed by both memory and -blkio controllers. - -Writeback examines both system-wide and per-cgroup dirty memory status -and enforces the more restrictive of the two. Also, writeback control -parameters which are absolute values - vm.dirty_bytes and -vm.dirty_background_bytes - are distributed across cgroups according -to their current writeback bandwidth. - -There's a peculiarity stemming from the discrepancy in ownership -granularity between memory controller and writeback. While memory -controller tracks ownership per page, writeback operates on inode -basis. 
cgroup writeback bridges the gap by tracking ownership by -inode but migrating ownership if too many foreign pages, pages which -don't match the current inode ownership, have been encountered while -writing back the inode. - -This is a conscious design choice as writeback operations are -inherently tied to inodes making strictly following page ownership -complicated and inefficient. The only use case which suffers from -this compromise is multiple cgroups concurrently dirtying disjoint -regions of the same inode, which is an unlikely use case and decided -to be unsupported. Note that as memory controller assigns page -ownership on the first use and doesn't update it until the page is -released, even if cgroup writeback strictly follows page ownership, -multiple cgroups dirtying overlapping areas wouldn't work as expected. -In general, write-sharing an inode across multiple cgroups is not well -supported. - -Filesystem support for cgroup writeback ---------------------------------------- - -A filesystem can make writeback IOs cgroup-aware by updating -address_space_operations->writepage[s]() to annotate bio's using the -following two functions. - -* wbc_init_bio(@wbc, @bio) - - Should be called for each bio carrying writeback data and associates - the bio with the inode's owner cgroup. Can be called anytime - between bio allocation and submission. - -* wbc_account_io(@wbc, @page, @bytes) - - Should be called for each data segment being written out. While - this function doesn't care exactly when it's called during the - writeback session, it's the easiest and most natural to call it as - data segments are added to a bio. - -With writeback bio's annotated, cgroup support can be enabled per -super_block by setting MS_CGROUPWB in ->s_flags. This allows for -selective disabling of cgroup writeback support which is helpful when -certain filesystem features, e.g. journaled data mode, are -incompatible. - -wbc_init_bio() binds the specified bio to its cgroup. Depending on -the configuration, the bio may be executed at a lower priority and if -the writeback session is holding shared resources, e.g. a journal -entry, may lead to priority inversion. There is no one easy solution -for the problem. Filesystems can try to work around specific problem -cases by skipping wbc_init_bio() or using bio_associate_blkcg() -directly. diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt deleted file mode 100644 index c6256ae..0000000 --- a/Documentation/cgroups/cgroups.txt +++ /dev/null @@ -1,682 +0,0 @@ - CGROUPS - ------- - -Written by Paul Menage based on -Documentation/cgroups/cpusets.txt - -Original copyright statements from cpusets.txt: -Portions Copyright (C) 2004 BULL SA. -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -Modified by Paul Jackson -Modified by Christoph Lameter - -CONTENTS: -========= - -1. Control Groups - 1.1 What are cgroups ? - 1.2 Why are cgroups needed ? - 1.3 How are cgroups implemented ? - 1.4 What does notify_on_release do ? - 1.5 What does clone_children do ? - 1.6 How do I use cgroups ? -2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Attaching processes - 2.3 Mounting hierarchies by name -3. Kernel API - 3.1 Overview - 3.2 Synchronization - 3.3 Subsystem API -4. Extended attributes usage -5. Questions - -1. Control Groups -================= - -1.1 What are cgroups ? 
----------------------- - -Control Groups provide a mechanism for aggregating/partitioning sets of -tasks, and all their future children, into hierarchical groups with -specialized behaviour. - -Definitions: - -A *cgroup* associates a set of tasks with a set of parameters for one -or more subsystems. - -A *subsystem* is a module that makes use of the task grouping -facilities provided by cgroups to treat groups of tasks in -particular ways. A subsystem is typically a "resource controller" that -schedules a resource or applies per-cgroup limits, but it may be -anything that wants to act on a group of processes, e.g. a -virtualization subsystem. - -A *hierarchy* is a set of cgroups arranged in a tree, such that -every task in the system is in exactly one of the cgroups in the -hierarchy, and a set of subsystems; each subsystem has system-specific -state attached to each cgroup in the hierarchy. Each hierarchy has -an instance of the cgroup virtual filesystem associated with it. - -At any one time there may be multiple active hierarchies of task -cgroups. Each hierarchy is a partition of all tasks in the system. - -User-level code may create and destroy cgroups by name in an -instance of the cgroup virtual file system, specify and query to -which cgroup a task is assigned, and list the task PIDs assigned to -a cgroup. Those creations and assignments only affect the hierarchy -associated with that instance of the cgroup file system. - -On their own, the only use for cgroups is for simple job -tracking. The intention is that other subsystems hook into the generic -cgroup support to provide new attributes for cgroups, such as -accounting/limiting the resources which processes in a cgroup can -access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allow -you to associate a set of CPUs and a set of memory nodes with the -tasks in each cgroup. - -1.2 Why are cgroups needed ? ----------------------------- - -There are multiple efforts to provide process aggregations in the -Linux kernel, mainly for resource-tracking purposes. Such efforts -include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server -namespaces. These all require the basic notion of a -grouping/partitioning of processes, with newly forked processes ending -up in the same group (cgroup) as their parent process. - -The kernel cgroup patch provides the minimum essential kernel -mechanisms required to efficiently implement such groups. It has -minimal impact on the system fast paths, and provides hooks for -specific subsystems such as cpusets to provide additional behaviour as -desired. - -Multiple hierarchy support is provided to allow for situations where -the division of tasks into cgroups is distinctly different for -different subsystems - having parallel hierarchies allows each -hierarchy to be a natural division of tasks, without having to handle -complex combinations of tasks that would be present if several -unrelated subsystems needed to be forced into the same tree of -cgroups. - -At one extreme, each resource controller or subsystem could be in a -separate hierarchy; at the other extreme, all subsystems -would be attached to the same hierarchy. - -As an example of a scenario (originally proposed by vatsa@in.ibm.com) -that can benefit from multiple hierarchies, consider a large -university server with various users - students, professors, system -tasks etc. 
The resource planning for this server could be along the -following lines: - - CPU : "Top cpuset" - / \ - CPUSet1 CPUSet2 - | | - (Professors) (Students) - - In addition (system tasks) are attached to topcpuset (so - that they can run anywhere) with a limit of 20% - - Memory : Professors (50%), Students (30%), system (20%) - - Disk : Professors (50%), Students (30%), system (20%) - - Network : WWW browsing (20%), Network File System (60%), others (20%) - / \ - Professors (15%) students (5%) - -Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes -into the NFS network class. - -At the same time Firefox/Lynx will share an appropriate CPU/Memory class -depending on who launched it (prof/student). - -With the ability to classify tasks differently for different resources -(by putting those resource subsystems in different hierarchies), -the admin can easily set up a script which receives exec notifications -and depending on who is launching the browser he can - - # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks - -With only a single hierarchy, he now would potentially have to create -a separate cgroup for every browser launched and associate it with -appropriate network and other resource class. This may lead to -proliferation of such cgroups. - -Also let's say that the administrator would like to give enhanced network -access temporarily to a student's browser (since it is night and the user -wants to do online gaming :)) OR give one of the student's simulation -apps enhanced CPU power. - -With the ability to write PIDs directly to resource classes, it's just a -matter of: - - # echo pid > /sys/fs/cgroup/network/<new_class>/tasks - (after some time) - # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks - -Without this ability, the administrator would have to split the cgroup into -multiple separate ones and then associate the new cgroups with the -new resource classes. - - - -1.3 How are cgroups implemented ? ---------------------------------- - -Control Groups extends the kernel as follows: - - - Each task in the system has a reference-counted pointer to a - css_set. - - - A css_set contains a set of reference-counted pointers to - cgroup_subsys_state objects, one for each cgroup subsystem - registered in the system. There is no direct link from a task to - the cgroup of which it's a member in each hierarchy, but this - can be determined by following pointers through the - cgroup_subsys_state objects. This is because accessing the - subsystem state is something that's expected to happen frequently - and in performance-critical code, whereas operations that require a - task's actual cgroup assignments (in particular, moving between - cgroups) are less common. A linked list runs through the cg_list - field of each task_struct using the css_set, anchored at - css_set->tasks. - - - A cgroup hierarchy filesystem can be mounted for browsing and - manipulation from user space. - - - You can list all the tasks (by PID) attached to any cgroup. - -The implementation of cgroups requires a few, simple hooks -into the rest of the kernel, none in performance-critical paths: - - - in init/main.c, to initialize the root cgroups and initial - css_set at system boot. - - - in fork and exit, to attach and detach a task from its css_set. - -In addition, a new file system of type "cgroup" may be mounted, to -enable browsing and modifying the cgroups presently known to the -kernel. When mounting a cgroup hierarchy, you may specify a -comma-separated list of subsystems to mount as the filesystem mount -options. 
By default, mounting the cgroup filesystem attempts to -mount a hierarchy containing all registered subsystems. - -If an active hierarchy with exactly the same set of subsystems already -exists, it will be reused for the new mount. If no existing hierarchy -matches, and any of the requested subsystems are in use in an existing -hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy -is activated, associated with the requested subsystems. - -It's not currently possible to bind a new subsystem to an active -cgroup hierarchy, or to unbind a subsystem from an active cgroup -hierarchy. This may be possible in future, but is fraught with nasty -error-recovery issues. - -When a cgroup filesystem is unmounted, if there are any -child cgroups created below the top-level cgroup, that hierarchy -will remain active even though unmounted; if there are no -child cgroups then the hierarchy will be deactivated. - -No new system calls are added for cgroups - all support for -querying and modifying cgroups is via this cgroup file system. - -Each task under /proc has an added file named 'cgroup' displaying, -for each active hierarchy, the subsystem names and the cgroup name -as the path relative to the root of the cgroup file system. - -Each cgroup is represented by a directory in the cgroup file system -containing the following files describing that cgroup: - - - tasks: list of tasks (by PID) attached to that cgroup. This list - is not guaranteed to be sorted. Writing a thread ID into this file - moves the thread into this cgroup. - - cgroup.procs: list of thread group IDs in the cgroup. This list is - not guaranteed to be sorted or free of duplicate TGIDs, and userspace - should sort/uniquify the list if this property is required. - Writing a thread group ID into this file moves all threads in that - group into this cgroup. - - notify_on_release flag: run the release agent on exit? - - release_agent: the path to use for release notifications (this file - exists in the top cgroup only) - -Other subsystems such as cpusets may add additional files in each -cgroup dir. - -New cgroups are created using the mkdir system call or shell -command. The properties of a cgroup, such as its flags, are -modified by writing to the appropriate file in that cgroups -directory, as listed above. - -The named hierarchical structure of nested cgroups allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cgroup allows organizing the work load -on a system into related sets of tasks. A task may be re-attached to -any other cgroup, if allowed by the permissions on the necessary -cgroup file system directories. - -When a task is moved from one cgroup to another, it gets a new -css_set pointer - if there's an already existing css_set with the -desired collection of cgroups then that group is reused, otherwise a new -css_set is allocated. The appropriate existing css_set is located by -looking into a hash table. - -To allow access from a cgroup to the css_sets (and hence tasks) -that comprise it, a set of cg_cgroup_link objects form a lattice; -each cg_cgroup_link is linked into a list of cg_cgroup_links for -a single cgroup on its cgrp_link_list field, and a list of -cg_cgroup_links for a single css_set on its cg_link_list. - -Thus the set of tasks in a cgroup can be listed by iterating over -each css_set that references the cgroup, and sub-iterating over -each css_set's task set. 
- -The use of a Linux virtual file system (vfs) to represent the -cgroup hierarchy provides for a familiar permission and name space -for cgroups, with a minimum of additional kernel code. - -1.4 What does notify_on_release do ? ------------------------------------- - -If the notify_on_release flag is enabled (1) in a cgroup, then -whenever the last task in the cgroup leaves (exits or attaches to -some other cgroup) and the last child cgroup of that cgroup -is removed, then the kernel runs the command specified by the contents -of the "release_agent" file in that hierarchy's root directory, -supplying the pathname (relative to the mount point of the cgroup -file system) of the abandoned cgroup. This enables automatic -removal of abandoned cgroups. The default value of -notify_on_release in the root cgroup at system boot is disabled -(0). The default value of other cgroups at creation is the current -value of their parents' notify_on_release settings. The default value of -a cgroup hierarchy's release_agent path is empty. - -1.5 What does clone_children do ? ---------------------------------- - -This flag only affects the cpuset controller. If the clone_children -flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its -configuration from the parent during initialization. - -1.6 How do I use cgroups ? --------------------------- - -To start a new job that is to be contained within a cgroup, using -the "cpuset" cgroup subsystem, the steps are something like: - - 1) mount -t tmpfs cgroup_root /sys/fs/cgroup - 2) mkdir /sys/fs/cgroup/cpuset - 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset - 4) Create the new cgroup by doing mkdir's and write's (or echo's) in - the /sys/fs/cgroup/cpuset virtual file system. - 5) Start a task that will be the "founding father" of the new job. - 6) Attach that task to the new cgroup by writing its PID to the - /sys/fs/cgroup/cpuset tasks file for that cgroup. - 7) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cgroup -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cgroup: - - mount -t tmpfs cgroup_root /sys/fs/cgroup - mkdir /sys/fs/cgroup/cpuset - mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset - cd /sys/fs/cgroup/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpuset.cpus - /bin/echo 1 > cpuset.mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cgroup Charlie - # The next line should display '/Charlie' - cat /proc/self/cgroup - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using cgroups can be done through the cgroup -virtual filesystem. - -To mount a cgroup hierarchy with all available subsystems, type: -# mount -t cgroup xxx /sys/fs/cgroup - -The "xxx" is not interpreted by the cgroup code, but will appear in -/proc/mounts so may be any useful identifying string that you like. - -Note: Some subsystems do not work without some user input first. For instance, -if cpusets are enabled the user will have to populate the cpus and mems files -for each new cgroup created before that group can be used. - -As explained in section `1.2 Why are cgroups needed?' you should create -different hierarchies of cgroups for each single resource or group of -resources you want to control. Therefore, you should mount a tmpfs on -/sys/fs/cgroup and create directories for each cgroup resource or resource -group. 
- -# mount -t tmpfs cgroup_root /sys/fs/cgroup -# mkdir /sys/fs/cgroup/rg1 - -To mount a cgroup hierarchy with just the cpuset and memory -subsystems, type: -# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1 - -While remounting cgroups is currently supported, it is not recommended -to use it. Remounting allows changing bound subsystems and -release_agent. Rebinding is hardly useful as it only works when the -hierarchy is empty and release_agent itself should be replaced with -conventional fsnotify. The support for remounting will be removed in -the future. - -To specify a hierarchy's release_agent: -# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ - xxx /sys/fs/cgroup/rg1 - -Note that specifying 'release_agent' more than once will return failure. - -Note that changing the set of subsystems is currently only supported -when the hierarchy consists of a single (root) cgroup. Supporting -the ability to arbitrarily bind/unbind subsystems from an existing -cgroup hierarchy is intended to be implemented in the future. - -Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the -tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 -is the cgroup that holds the whole system. - -If you want to change the value of release_agent: -# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent - -It can also be changed via remount. - -If you want to create a new cgroup under /sys/fs/cgroup/rg1: -# cd /sys/fs/cgroup/rg1 -# mkdir my_cgroup - -Now you want to do something with this cgroup. -# cd my_cgroup - -In this directory you can find several files: -# ls -cgroup.procs notify_on_release tasks -(plus whatever files added by the attached subsystems) - -Now attach your shell to this cgroup: -# /bin/echo $$ > tasks - -You can also create cgroups inside your cgroup by using mkdir in this -directory. -# mkdir my_sub_cs - -To remove a cgroup, just use rmdir: -# rmdir my_sub_cs - -This will fail if the cgroup is in use (has cgroups inside, or -has processes attached, or is held alive by other subsystem-specific -reference). - -2.2 Attaching processes ------------------------ - -# /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another: - -# /bin/echo PID1 > tasks -# /bin/echo PID2 > tasks - ... -# /bin/echo PIDn > tasks - -You can attach the current shell task by echoing 0: - -# echo 0 > tasks - -You can use the cgroup.procs file instead of the tasks file to move all -threads in a threadgroup at once. Echoing the PID of any task in a -threadgroup to cgroup.procs causes all tasks in that threadgroup to be -attached to the cgroup. Writing 0 to cgroup.procs moves all tasks -in the writing task's threadgroup. - -Note: Since every task is always a member of exactly one cgroup in each -mounted hierarchy, to remove a task from its current cgroup you must -move it into a new cgroup (possibly the root cgroup) by writing to the -new cgroup's tasks file. - -Note: Due to some restrictions enforced by some cgroup subsystems, moving -a process to another cgroup can fail. - -2.3 Mounting hierarchies by name --------------------------------- - -Passing the name= option when mounting a cgroups hierarchy -associates the given name with the hierarchy. This can be used when -mounting a pre-existing hierarchy, in order to refer to it by name -rather than by its set of active subsystems. Each hierarchy is either -nameless, or has a unique name. 
- -The name should match [\w.-]+ - -When passing a name= option for a new hierarchy, you need to -specify subsystems manually; the legacy behaviour of mounting all -subsystems when none are explicitly specified is not supported when -you give a subsystem a name. - -The name of the subsystem appears as part of the hierarchy description -in /proc/mounts and /proc/<pid>/cgroups. - - -3. Kernel API ============= - -3.1 Overview ------------- - -Each kernel subsystem that wants to hook into the generic cgroup -system needs to create a cgroup_subsys object. This contains -various methods, which are callbacks from the cgroup system, along -with a subsystem ID which will be assigned by the cgroup system. - -Other fields in the cgroup_subsys object include: - -- subsys_id: a unique array index for the subsystem, indicating which - entry in cgroup->subsys[] this subsystem should be managing. - -- name: should be initialized to a unique subsystem name. Should be - no longer than MAX_CGROUP_TYPE_NAMELEN. - -- early_init: indicate if the subsystem needs early initialization - at system boot. - -Each cgroup object created by the system has an array of pointers, -indexed by subsystem ID; this pointer is entirely managed by the -subsystem; the generic cgroup code will never touch this pointer. - -3.2 Synchronization -------------------- - -There is a global mutex, cgroup_mutex, used by the cgroup -system. This should be taken by anything that wants to modify a -cgroup. It may also be taken to prevent cgroups from being -modified, but more specific locks may be more appropriate in that -situation. - -See kernel/cgroup.c for more details. - -Subsystems can take/release the cgroup_mutex via the functions -cgroup_lock()/cgroup_unlock(). - -Accessing a task's cgroup pointer may be done in the following ways: -- while holding cgroup_mutex -- while holding the task's alloc_lock (via task_lock()) -- inside an rcu_read_lock() section via rcu_dereference() - -3.3 Subsystem API ------------------ - -Each subsystem should: - -- add an entry in linux/cgroup_subsys.h -- define a cgroup_subsys object called <name>_subsys - -If a subsystem can be compiled as a module, it should also have in its -module initcall a call to cgroup_load_subsys(), and in its exitcall a -call to cgroup_unload_subsys(). It should also set its_subsys.module = -THIS_MODULE in its .c file. - -Each subsystem may export the following methods. The only mandatory -methods are css_alloc/free. Any others that are null are presumed to -be successful no-ops. - -struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp) -(cgroup_mutex held by caller) - -Called to allocate a subsystem state object for a cgroup. The -subsystem should allocate its subsystem state object for the passed -cgroup, returning a pointer to the new object on success or an -ERR_PTR() value. On success, the subsystem pointer should point to -a structure of type cgroup_subsys_state (typically embedded in a -larger subsystem-specific object), which will be initialized by the -cgroup system. Note that this will be called at initialization to -create the root subsystem state for this subsystem; this case can be -identified by the passed cgroup object having a NULL parent (since -it's the root of the hierarchy) and may be an appropriate place for -initialization code. - -int css_online(struct cgroup *cgrp) -(cgroup_mutex held by caller) - -Called after @cgrp successfully completed all allocations and made -visible to cgroup_for_each_child/descendant_*() iterators. 
The -subsystem may choose to fail creation by returning -errno. This -callback can be used to implement reliable state sharing and -propagation along the hierarchy. See the comment on -cgroup_for_each_descendant_pre() for details. - -void css_offline(struct cgroup *cgrp); -(cgroup_mutex held by caller) - -This is the counterpart of css_online() and called iff css_online() -has succeeded on @cgrp. This signifies the beginning of the end of -@cgrp. @cgrp is being removed and the subsystem should start dropping -all references it's holding on @cgrp. When all references are dropped, -cgroup removal will proceed to the next step - css_free(). After this -callback, @cgrp should be considered dead to the subsystem. - -void css_free(struct cgroup *cgrp) -(cgroup_mutex held by caller) - -The cgroup system is about to free @cgrp; the subsystem should free -its subsystem state object. By the time this method is called, @cgrp -is completely unused; @cgrp->parent is still valid. (Note - can also -be called for a newly-created cgroup if an error occurs after this -subsystem's create() method has been called for the new cgroup). - -int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called prior to moving one or more tasks into a cgroup; if the -subsystem returns an error, this will abort the attach operation. -@tset contains the tasks to be attached and is guaranteed to have at -least one task in it. - -If there are multiple tasks in the taskset, then: - - it's guaranteed that all are from the same thread group - - @tset contains all tasks from the thread group whether or not - they're switching cgroups - - the first task is the leader - -Each @tset entry also contains the task's old cgroup and tasks which -aren't switching cgroup can be skipped easily using the -cgroup_taskset_for_each() iterator. Note that this isn't called on a -fork. If this method returns 0 (success) then this should remain valid -while the caller holds cgroup_mutex and it is ensured that either -attach() or cancel_attach() will be called in future. - -void css_reset(struct cgroup_subsys_state *css) -(cgroup_mutex held by caller) - -An optional operation which should restore @css's configuration to the -initial state. This is currently only used on the unified hierarchy -when a subsystem is disabled on a cgroup through -"cgroup.subtree_control" but should remain enabled because other -subsystems depend on it. cgroup core makes such a css invisible by -removing the associated interface files and invokes this callback so -that the hidden subsystem can return to the initial neutral state. -This prevents unexpected resource control from a hidden css and -ensures that the configuration is in the initial state when it is made -visible again later. - -void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called when a task attach operation has failed after can_attach() has succeeded. -A subsystem whose can_attach() has some side-effects should provide this -function, so that the subsystem can implement a rollback. If not, it is not -necessary. This will be called only for subsystems whose can_attach() -operation has succeeded. The parameters are identical to can_attach(). - -void attach(struct cgroup *cgrp, struct cgroup_taskset *tset) -(cgroup_mutex held by caller) - -Called after the task has been attached to the cgroup, to allow any -post-attachment activity that requires memory allocations or blocking. 
-The parameters are identical to can_attach().
-
-void fork(struct task_struct *task)
-
-Called when a task is forked into a cgroup.
-
-void exit(struct task_struct *task)
-
-Called during task exit.
-
-void free(struct task_struct *task)
-
-Called when the task_struct is freed.
-
-void bind(struct cgroup *root)
-(cgroup_mutex held by caller)
-
-Called when a cgroup subsystem is rebound to a different hierarchy
-and root cgroup. Currently this will only involve movement between
-the default hierarchy (which never has sub-cgroups) and a hierarchy
-that is being created/destroyed (and hence has no sub-cgroups).
-
-4. Extended attribute usage
-===========================
-
-The cgroup filesystem supports certain types of extended attributes
-in its directories and files. The currently supported types are:
-	- Trusted (XATTR_TRUSTED)
-	- Security (XATTR_SECURITY)
-
-Both require the CAP_SYS_ADMIN capability to set.
-
-Like in tmpfs, the extended attributes in the cgroup filesystem are
-stored using kernel memory and it's advised to keep the usage to a
-minimum. This is the reason why user-defined extended attributes are
-not supported, since any user could set them and there is no limit on
-the value size.
-
-The currently known users of this feature are SELinux, to limit cgroup
-usage in containers, and systemd, for assorted metadata such as the
-main PID in a cgroup (systemd creates a cgroup per service).
-
-5. Questions
-============
-
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write()
-   against errors. If you use it in the cgroup file system, you won't
-   be able to tell whether a command succeeded or failed.
-
-Q: When I attach processes, only the first of the line gets really
-   attached!
-A: We can only return one error code per call to write(). So you
-   should put only ONE PID per write() call.
-
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt
deleted file mode 100644
index 9d73cc0..0000000
--- a/Documentation/cgroups/cpuacct.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-CPU Accounting Controller
--------------------------
-
-The CPU accounting controller is used to group tasks using cgroups and
-account the CPU usage of these groups of tasks.
-
-The CPU accounting controller supports multi-hierarchy groups. An
-accounting group accumulates the CPU usage of all of its child groups
-and the tasks directly present in its group.
-
-Accounting groups can be created by first mounting the cgroup
-filesystem.
-
-# mount -t cgroup -ocpuacct none /sys/fs/cgroup
-
-With the above step, the initial or the parent accounting group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks
-in the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds)
-obtained by this group, which is essentially the CPU time obtained by
-all the tasks in the system.
-
-New accounting groups can be created under the parent group
-/sys/fs/cgroup.
-
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it. CPU time consumed by this bash and its children
-can be obtained from g1/cpuacct.usage, and the same is also accumulated
-in /sys/fs/cgroup/cpuacct.usage.
-
-The cpuacct.stat file lists a few statistics which further divide the
-CPU time obtained by the cgroup into user and system times. Currently
-the following statistics are supported:
-
-user: Time spent by tasks of the cgroup in user mode.
-system: Time spent by tasks of the cgroup in kernel mode. - -user and system are in USER_HZ unit. - -cpuacct controller uses percpu_counter interface to collect user and -system times. This has two side effects: - -- It is theoretically possible to see wrong values for user and system times. - This is because percpu_counter_read() on 32bit systems isn't safe - against concurrent writes. -- It is possible to see slightly outdated values for user and system times - due to the batch processing nature of percpu_counter. diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt deleted file mode 100644 index fdf7dff..0000000 --- a/Documentation/cgroups/cpusets.txt +++ /dev/null @@ -1,839 +0,0 @@ - CPUSETS - ------- - -Copyright (C) 2004 BULL SA. -Written by Simon.Derr@bull.net - -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -Modified by Paul Jackson -Modified by Christoph Lameter -Modified by Paul Menage -Modified by Hidetoshi Seto - -CONTENTS: -========= - -1. Cpusets - 1.1 What are cpusets ? - 1.2 Why are cpusets needed ? - 1.3 How are cpusets implemented ? - 1.4 What are exclusive cpusets ? - 1.5 What is memory_pressure ? - 1.6 What is memory spread ? - 1.7 What is sched_load_balance ? - 1.8 What is sched_relax_domain_level ? - 1.9 How do I use cpusets ? -2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Adding/removing cpus - 2.3 Setting flags - 2.4 Attaching processes -3. Questions -4. Contact - -1. Cpusets -========== - -1.1 What are cpusets ? ----------------------- - -Cpusets provide a mechanism for assigning a set of CPUs and Memory -Nodes to a set of tasks. In this document "Memory Node" refers to -an on-line node that contains memory. - -Cpusets constrain the CPU and Memory placement of tasks to only -the resources within a task's current cpuset. They form a nested -hierarchy visible in a virtual file system. These are the essential -hooks, beyond what is already present, required to manage dynamic -job placement on large systems. - -Cpusets use the generic cgroup subsystem described in -Documentation/cgroups/cgroups.txt. - -Requests by a task, using the sched_setaffinity(2) system call to -include CPUs in its CPU affinity mask, and using the mbind(2) and -set_mempolicy(2) system calls to include Memory Nodes in its memory -policy, are both filtered through that task's cpuset, filtering out any -CPUs or Memory Nodes not in that cpuset. The scheduler will not -schedule a task on a CPU that is not allowed in its cpus_allowed -vector, and the kernel page allocator will not allocate a page on a -node that is not allowed in the requesting task's mems_allowed vector. - -User level code may create and destroy cpusets by name in the cgroup -virtual file system, manage the attributes and permissions of these -cpusets and which CPUs and Memory Nodes are assigned to each cpuset, -specify and query to which cpuset a task is assigned, and list the -task pids assigned to a cpuset. - - -1.2 Why are cpusets needed ? ----------------------------- - -The management of large computer systems, with many processors (CPUs), -complex memory cache hierarchies and multiple Memory Nodes having -non-uniform access times (NUMA) presents additional challenges for -the efficient scheduling and memory placement of processes. - -Frequently more modest sized systems can be operated with adequate -efficiency just by letting the operating system automatically share -the available CPU and Memory resources amongst the requesting tasks. 
-
-But larger systems, which benefit more from careful processor and
-memory placement to reduce memory access times and contention,
-and which typically represent a larger investment for the customer,
-can benefit from explicitly placing jobs on properly sized subsets of
-the system.
-
-This can be especially valuable on:
-
-    * Web Servers running multiple instances of the same web application,
-    * Servers running different applications (for instance, a web server
-      and a database), or
-    * NUMA systems running large HPC applications with demanding
-      performance characteristics.
-
-These subsets, or "soft partitions", must be able to be dynamically
-adjusted, as the job mix changes, without impacting other concurrently
-executing jobs. The location of the running jobs' pages may also be
-moved when the memory locations are changed.
-
-The kernel cpuset patch provides the minimum essential kernel
-mechanisms required to efficiently implement such subsets. It
-leverages existing CPU and Memory Placement facilities in the Linux
-kernel to avoid any additional impact on the critical scheduler or
-memory allocator code.
-
-
-1.3 How are cpusets implemented ?
---------------------------------
-
-Cpusets provide a Linux kernel mechanism to constrain which CPUs and
-Memory Nodes are used by a process or set of processes.
-
-The Linux kernel already has a pair of mechanisms to specify on which
-CPUs a task may be scheduled (sched_setaffinity) and on which Memory
-Nodes it may obtain memory (mbind, set_mempolicy).
-
-Cpusets extend these two mechanisms as follows:
-
- - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
-   kernel.
- - Each task in the system is attached to a cpuset, via a pointer
-   in the task structure to a reference counted cgroup structure.
- - Calls to sched_setaffinity are filtered to just those CPUs
-   allowed in that task's cpuset.
- - Calls to mbind and set_mempolicy are filtered to just
-   those Memory Nodes allowed in that task's cpuset.
- - The root cpuset contains all the system's CPUs and Memory
-   Nodes.
- - For any cpuset, one can define child cpusets containing a subset
-   of the parent's CPU and Memory Node resources.
- - The hierarchy of cpusets can be mounted at /dev/cpuset, for
-   browsing and manipulation from user space.
- - A cpuset may be marked exclusive, which ensures that no other
-   cpuset (except direct ancestors and descendants) may contain
-   any overlapping CPUs or Memory Nodes.
- - You can list all the tasks (by pid) attached to any cpuset.
-
-The implementation of cpusets requires a few, simple hooks
-into the rest of the kernel, none in performance critical paths:
-
- - in init/main.c, to initialize the root cpuset at system boot.
- - in fork and exit, to attach and detach a task from its cpuset.
- - in sched_setaffinity, to mask the requested CPUs by what's
-   allowed in that task's cpuset.
- - in sched.c migrate_live_tasks(), to keep migrating tasks within
-   the CPUs allowed by their cpuset, if possible.
- - in the mbind and set_mempolicy system calls, to mask the requested
-   Memory Nodes by what's allowed in that task's cpuset.
- - in page_alloc.c, to restrict memory to allowed nodes.
- - in vmscan.c, to restrict page reclaim to the current cpuset.
-
-You should mount the "cgroup" filesystem type in order to enable
-browsing and modifying the cpusets presently known to the kernel. No
-new system calls are added for cpusets - all support for querying and
-modifying cpusets is via this cpuset file system.
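-
-For example, one might mount the hierarchy and inspect the root cpuset
-like this (a minimal sketch; the mount point is conventional, not
-mandatory):
-
-  # mkdir -p /sys/fs/cgroup/cpuset
-  # mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
-  # cat /sys/fs/cgroup/cpuset/cpuset.cpus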
-
-The /proc/<pid>/status file for each task has four added lines,
-displaying the task's cpus_allowed (on which CPUs it may be scheduled)
-and mems_allowed (on which Memory Nodes it may obtain memory),
-in the two formats seen in the following example:
-
-  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
-  Cpus_allowed_list:      0-127
-  Mems_allowed:   ffffffff,ffffffff
-  Mems_allowed_list:      0-63
-
-Each cpuset is represented by a directory in the cgroup file system
-containing (on top of the standard cgroup files) the following
-files describing that cpuset:
-
- - cpuset.cpus: list of CPUs in that cpuset
- - cpuset.mems: list of Memory Nodes in that cpuset
- - cpuset.memory_migrate flag: if set, move pages to the cpuset's nodes
- - cpuset.cpu_exclusive flag: is cpu placement exclusive?
- - cpuset.mem_exclusive flag: is memory placement exclusive?
- - cpuset.mem_hardwall flag: is memory allocation hardwalled
- - cpuset.memory_pressure: measure of how much paging pressure in cpuset
- - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
- - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
- - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
- - cpuset.sched_relax_domain_level: the searching range when migrating tasks
-
-In addition, only the root cpuset has the following file:
- - cpuset.memory_pressure_enabled flag: compute memory_pressure?
-
-New cpusets are created using the mkdir system call or shell
-command. The properties of a cpuset, such as its flags, allowed
-CPUs and Memory Nodes, and attached tasks, are modified by writing
-to the appropriate file in that cpuset's directory, as listed above.
-
-The named hierarchical structure of nested cpusets allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cpuset allows organizing the work load
-on a system into related sets of tasks such that each set is constrained
-to using the CPUs and Memory Nodes of a particular cpuset. A task
-may be re-attached to any other cpuset, if allowed by the permissions
-on the necessary cpuset file system directories.
-
-Such management of a system "in the large" integrates smoothly with
-the detailed placement done on individual tasks and memory regions
-using the sched_setaffinity, mbind and set_mempolicy system calls.
-
-The following rules apply to each cpuset:
-
- - Its CPUs and Memory Nodes must be a subset of its parent's.
- - It can't be marked exclusive unless its parent is.
- - If its cpu or memory is exclusive, they may not overlap any sibling.
-
-These rules, and the natural hierarchy of cpusets, enable efficient
-enforcement of the exclusive guarantee, without having to scan all
-cpusets every time any of them change to ensure nothing overlaps an
-exclusive cpuset. Also, the use of a Linux virtual file system (vfs)
-to represent the cpuset hierarchy provides for a familiar permission
-and name space for cpusets, with a minimum of additional kernel code.
-
-The cpus and mems files in the root (top_cpuset) cpuset are
-read-only. The cpus file automatically tracks the value of
-cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_MEMORY]--i.e.,
-nodes with memory--using the cpuset_track_online_nodes() hook.
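-
-Putting the above together, creating a child cpuset can look like this
-(a sketch; "big_job" is a hypothetical name, and the hierarchy is
-assumed mounted as shown earlier):
-
-  # cd /sys/fs/cgroup/cpuset
-  # mkdir big_job                        # child of the root cpuset
-  # /bin/echo 0-3 > big_job/cpuset.cpus  # must be a subset of the parent's
-  # /bin/echo 0 > big_job/cpuset.mems    # likewise for Memory Nodes
-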
-
-1.4 What are exclusive cpusets ?
---------------------------------
-
-If a cpuset is cpu or mem exclusive, no other cpuset, other than
-a direct ancestor or descendant, may share any of the same CPUs or
-Memory Nodes.
-
-A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is
-"hardwalled", i.e. it restricts kernel allocations for page, buffer
-and other data commonly shared by the kernel across multiple users.
-All cpusets, whether hardwalled or not, restrict allocations of
-memory for user space. This enables configuring a system so that
-several independent jobs can share common kernel data, such as file
-system pages, while isolating each job's user allocation in its own
-cpuset. To do this, construct a large mem_exclusive cpuset to hold
-all the jobs, and construct child, non-mem_exclusive cpusets for each
-individual job. Only a small amount of typical kernel memory, such as
-requests from interrupt handlers, is allowed to be taken outside even
-a mem_exclusive cpuset.
-
-
-1.5 What is memory_pressure ?
------------------------------
-The memory_pressure of a cpuset provides a simple per-cpuset metric
-of the rate that the tasks in a cpuset are attempting to free up
-in-use memory on the nodes of the cpuset to satisfy additional memory
-requests.
-
-This enables batch managers, monitoring jobs running in dedicated
-cpusets, to efficiently detect what level of memory pressure that job
-is causing.
-
-This is useful both on tightly managed systems running a wide mix of
-submitted jobs, which may choose to terminate or re-prioritize jobs
-that are trying to use more memory than allowed on the nodes assigned
-to them, and with tightly coupled, long running, massively parallel
-scientific computing jobs that will dramatically fail to meet required
-performance goals if they start to use more memory than allowed to
-them.
-
-This mechanism provides a very economical way for the batch manager
-to monitor a cpuset for signs of memory pressure. It's up to the
-batch manager or other user code to decide what to do about it and
-take action.
-
-==> Unless this feature is enabled by writing "1" to the special file
-    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
-    code of __alloc_pages() for this metric reduces to simply noticing
-    that the cpuset_memory_pressure_enabled flag is zero. So only
-    systems that enable this feature will compute the metric.
-
-Why a per-cpuset, running average:
-
-    Because this meter is per-cpuset, rather than per-task or mm,
-    the system load imposed by a batch scheduler monitoring this
-    metric is sharply reduced on large systems, because a scan of
-    the tasklist can be avoided on each set of queries.
-
-    Because this meter is a running average, instead of an accumulating
-    counter, a batch scheduler can detect memory pressure with a
-    single read, instead of having to read and accumulate results
-    for a period of time.
-
-    Because this meter is per-cpuset rather than per-task or mm,
-    the batch scheduler can obtain the key information, memory
-    pressure in a cpuset, with a single read, rather than having to
-    query and accumulate results over all the (dynamically changing)
-    set of tasks in the cpuset.
-
-A per-cpuset simple digital filter (requires a spinlock and 3 words
-of data per-cpuset) is kept, and updated by any task attached to that
-cpuset, if it enters the synchronous (direct) page reclaim code.
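-
-For example, a batch manager might enable the metric once and then
-poll each job cpuset (a sketch; "cs1" is a hypothetical cpuset, and
-the hierarchy is assumed mounted at /sys/fs/cgroup/cpuset rather than
-the legacy /dev/cpuset):
-
-  # /bin/echo 1 > /sys/fs/cgroup/cpuset/cpuset.memory_pressure_enabled
-  # cat /sys/fs/cgroup/cpuset/cs1/cpuset.memory_pressure
-  0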
-
-A per-cpuset file provides an integer number representing the recent
-(half-life of 10 seconds) rate of direct page reclaims caused by
-the tasks in the cpuset, in units of reclaims attempted per second,
-times 1000.
-
-
-1.6 What is memory spread ?
----------------------------
-There are two boolean flag files per cpuset that control where the
-kernel allocates pages for the file system buffers and related in
-kernel data structures. They are called 'cpuset.memory_spread_page' and
-'cpuset.memory_spread_slab'.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set,
-then the kernel will spread the file system buffers (page cache) evenly
-over all the nodes that the faulting task is allowed to use, instead
-of preferring to put those pages on the node where the task is running.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
-then the kernel will spread some file system related slab caches,
-such as those for inodes and dentries, evenly over all the nodes that
-the faulting task is allowed to use, instead of preferring to put those
-pages on the node where the task is running.
-
-The setting of these flags does not affect the anonymous data segment
-or stack segment pages of a task.
-
-By default, both kinds of memory spreading are off, and memory
-pages are allocated on the node local to where the task is running,
-except perhaps as modified by the task's NUMA mempolicy or cpuset
-configuration, so long as sufficient free memory pages are available.
-
-When new cpusets are created, they inherit the memory spread settings
-of their parent.
-
-Setting memory spreading causes allocations for the affected page
-or slab caches to ignore the task's NUMA mempolicy and be spread
-instead. Tasks using mbind() or set_mempolicy() calls to set NUMA
-mempolicies will not notice any change in these calls as a result of
-their containing task's memory spread settings. If memory spreading
-is turned off, then the currently specified NUMA mempolicy once again
-applies to memory page allocations.
-
-Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are
-boolean flag files. By default they contain "0", meaning that the
-feature is off for that cpuset. If a "1" is written to that file, then
-that turns the named feature on.
-
-The implementation is simple.
-
-Setting the flag 'cpuset.memory_spread_page' turns on a per-process
-flag PFA_SPREAD_PAGE for each task that is in that cpuset or
-subsequently joins that cpuset. The page allocation calls for the
-page cache are modified to perform an inline check for this
-PFA_SPREAD_PAGE task flag, and if set, a call to a new routine
-cpuset_mem_spread_node() returns the node to prefer for the allocation.
-
-Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
-PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
-pages from the node returned by cpuset_mem_spread_node().
-
-The cpuset_mem_spread_node() routine is also simple. It uses the
-value of a per-task rotor cpuset_mem_spread_rotor to select the next
-node in the current task's mems_allowed to prefer for the allocation.
-
-This memory placement policy is also known (in other contexts) as
-round-robin or interleave.
-
-This policy can provide substantial improvements for jobs that need
-to place thread local data on the corresponding node, but that need
-to access large file system data sets that need to be spread across
-the several nodes in the job's cpuset in order to fit. Without this
-policy, especially for jobs that might have one thread reading in the
-data set, the memory allocation across the nodes in the job's cpuset
-can become very uneven.
-
-1.7 What is sched_load_balance ?
---------------------------------
-
-The kernel scheduler (kernel/sched/core.c) automatically load balances
-tasks. If one CPU is underutilized, kernel code running on that
-CPU will look for tasks on other more overloaded CPUs and move those
-tasks to itself, within the constraints of such placement mechanisms
-as cpusets and sched_setaffinity.
-
-The algorithmic cost of load balancing and its impact on key shared
-kernel data structures such as the task list increases more than
-linearly with the number of CPUs being balanced. So the scheduler
-has support to partition the system's CPUs into a number of sched
-domains such that it only load balances within each sched domain.
-Each sched domain covers some subset of the CPUs in the system;
-no two sched domains overlap; some CPUs might not be in any sched
-domain and hence won't be load balanced.
-
-Put simply, it costs less to balance between two smaller sched domains
-than one big one, but doing so means that overloads in one of the
-two domains won't be load balanced to the other one.
-
-By default, there is one sched domain covering all CPUs, including
-those marked isolated using the kernel boot time "isolcpus=" argument.
-However, the isolated CPUs will not participate in load balancing, and
-will not have tasks running on them unless explicitly assigned.
-
-This default load balancing across all CPUs is not well suited for
-the following two situations:
- 1) On large systems, load balancing across many CPUs is expensive.
-    If the system is managed using cpusets to place independent jobs
-    on separate sets of CPUs, full load balancing is unnecessary.
- 2) Systems supporting realtime on some CPUs need to minimize
-    system overhead on those CPUs, including avoiding task load
-    balancing if that is not needed.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the
-default setting), it requests that all the CPUs in that cpuset's
-allowed 'cpuset.cpus' be contained in a single sched domain, ensuring
-that load balancing can move a task (not otherwise pinned, as by
-sched_setaffinity) from any CPU in that cpuset to any other.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then
-the scheduler will avoid load balancing across the CPUs in that cpuset,
---except-- in so far as is necessary because some overlapping cpuset
-has "sched_load_balance" enabled.
-
-So, for example, if the top cpuset has the flag
-"cpuset.sched_load_balance" enabled, then the scheduler will have one
-sched domain covering all CPUs, and the setting of the
-"cpuset.sched_load_balance" flag in any other cpusets won't matter,
-as we're already fully load balancing.
-
-Therefore in the above two situations, the top cpuset flag
-"cpuset.sched_load_balance" should be disabled, and only some of the
-smaller, child cpusets should have this flag enabled.
-
-When doing this, you don't usually want to leave any unpinned tasks in
-the top cpuset that might use non-trivial amounts of CPU, as such tasks
-may be artificially constrained to some subset of CPUs, depending on
-the particulars of this flag setting in descendant cpusets. Even if
-such a task could use spare CPU cycles in some other CPUs, the kernel
-scheduler might not consider the possibility of load balancing that
-task to that underused CPU.
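-
-A sketch of that recommended setup ("batch1" and "batch2" are
-hypothetical child cpusets that already exist):
-
-  # cd /sys/fs/cgroup/cpuset
-  # /bin/echo 0 > cpuset.sched_load_balance     # no system-wide domain
-  # /bin/echo 1 > batch1/cpuset.sched_load_balance
-  # /bin/echo 1 > batch2/cpuset.sched_load_balance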
- -Of course, tasks pinned to a particular CPU can be left in a cpuset -that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere -else anyway. - -There is an impedance mismatch here, between cpusets and sched domains. -Cpusets are hierarchical and nest. Sched domains are flat; they don't -overlap and each CPU is in at most one sched domain. - -It is necessary for sched domains to be flat because load balancing -across partially overlapping sets of CPUs would risk unstable dynamics -that would be beyond our understanding. So if each of two partially -overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we -form a single sched domain that is a superset of both. We won't move -a task to a CPU outside its cpuset, but the scheduler load balancing -code might waste some compute cycles considering that possibility. - -This mismatch is why there is not a simple one-to-one relation -between which cpusets have the flag "cpuset.sched_load_balance" enabled, -and the sched domain configuration. If a cpuset enables the flag, it -will get balancing across all its CPUs, but if it disables the flag, -it will only be assured of no load balancing if no other overlapping -cpuset enables the flag. - -If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only -one of them has this flag enabled, then the other may find its -tasks only partially load balanced, just on the overlapping CPUs. -This is just the general case of the top_cpuset example given a few -paragraphs above. In the general case, as in the top cpuset case, -don't leave tasks that might use non-trivial amounts of CPU in -such partially load balanced cpusets, as they may be artificially -constrained to some subset of the CPUs allowed to them, for lack of -load balancing to the other CPUs. - -CPUs in "cpuset.isolcpus" were excluded from load balancing by the -isolcpus= kernel boot option, and will never be load balanced regardless -of the value of "cpuset.sched_load_balance" in any cpuset. - -1.7.1 sched_load_balance implementation details. ------------------------------------------------- - -The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary -to most cpuset flags.) When enabled for a cpuset, the kernel will -ensure that it can load balance across all the CPUs in that cpuset -(makes sure that all the CPUs in the cpus_allowed of that cpuset are -in the same sched domain.) - -If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled, -then they will be (must be) both in the same sched domain. - -If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled, -then by the above that means there is a single sched domain covering -the whole system, regardless of any other cpuset settings. - -The kernel commits to user space that it will avoid load balancing -where it can. It will pick as fine a granularity partition of sched -domains as it can while still providing load balancing for any set -of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled. - -The internal kernel cpuset to scheduler interface passes from the -cpuset code to the scheduler code a partition of the load balanced -CPUs in the system. This partition is a set of subsets (represented -as an array of struct cpumask) of CPUs, pairwise disjoint, that cover -all the CPUs that must be load balanced. 
-
-The cpuset code builds a new such partition and passes it to the
-scheduler sched domain setup code, to have the sched domains rebuilt
-as necessary, whenever:
- - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs
-   changes,
- - or CPUs come or go from a cpuset with this flag enabled,
- - or the 'cpuset.sched_relax_domain_level' value of a cpuset with
-   non-empty CPUs and with this flag enabled changes,
- - or a cpuset with non-empty CPUs and with this flag enabled is removed,
- - or a cpu is offlined/onlined.
-
-This partition exactly defines what sched domains the scheduler should
-setup - one sched domain for each element (struct cpumask) in the
-partition.
-
-The scheduler remembers the currently active sched domain partitions.
-When the scheduler routine partition_sched_domains() is invoked from
-the cpuset code to update these sched domains, it compares the new
-partition requested with the current, and updates its sched domains,
-removing the old and adding the new, for each change.
-
-
-1.8 What is sched_relax_domain_level ?
---------------------------------------
-
-Within a sched domain, the scheduler migrates tasks in two ways:
-periodic load balancing on the tick, and at the time of certain
-scheduling events.
-
-When a task is woken up, the scheduler tries to move it to an idle
-CPU. For example, if a task A running on CPU X activates another
-task B on the same CPU X, and if CPU Y is X's sibling and is idle,
-then the scheduler migrates task B to CPU Y so that task B can start
-on CPU Y without waiting for task A on CPU X.
-
-And if a CPU runs out of tasks in its runqueue, the CPU tries to pull
-extra tasks from other busy CPUs to help them before it goes idle.
-
-Of course it takes some search cost to find movable tasks and/or idle
-CPUs, so the scheduler might not search all CPUs in the domain on
-every such event. In fact, on some architectures, the search range on
-these events is limited to the same socket or node as the CPU, while
-the load balance on the tick searches all of them.
-
-For example, assume CPU Z is relatively far from CPU X. Even if CPU Z
-is idle while CPU X and its siblings are busy, the scheduler can't
-migrate the woken task B from X to Z since it is out of its search
-range. As a result, task B on CPU X needs to wait for task A, or for
-load balancing on the next tick. For some applications in special
-situations, waiting one tick may be too long.
-
-The 'cpuset.sched_relax_domain_level' file allows you to request
-changing this search range as you like. This file takes an int value
-which ideally indicates the size of the search range in levels, as
-follows; otherwise it holds the initial value -1, which indicates
-that the cpuset has no request.
-
-  -1  : no request. use system default or follow request of others.
-   0  : no search.
-   1  : search siblings (hyperthreads in a core).
-   2  : search cores in a package.
-   3  : search cpus in a node [= system wide on non-NUMA system]
-   4  : search nodes in a chunk of node [on NUMA system]
-   5  : search system wide [on NUMA system]
-
-The system default is architecture dependent. The system default
-can be changed using the relax_domain_level= boot parameter.
-
-This file is per-cpuset and affects the sched domain to which the
-cpuset belongs. Therefore if the flag 'cpuset.sched_load_balance' of
-a cpuset is disabled, then 'cpuset.sched_relax_domain_level' has no
-effect since there is no sched domain belonging to the cpuset.
-
-If multiple cpusets are overlapping and hence form a single sched
-domain, the largest value among those is used. Be careful: if one
-cpuset requests 0 and the others request -1, then 0 is used.
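-
-For example, to ask for wakeup-time idle-CPU search across a whole
-node (a sketch; "rt_job" is a hypothetical cpuset that already exists):
-
-  # cd /sys/fs/cgroup/cpuset/rt_job
-  # /bin/echo 3 > cpuset.sched_relax_domain_level
-  # cat cpuset.sched_relax_domain_level
-  3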
-
-Note that modifying this file will have both good and bad effects,
-and whether it is acceptable or not depends on your situation.
-Don't modify this file if you are not sure.
-
-Increasing 'sched_relax_domain_level' is likely to benefit you if:
- - the migration costs between CPUs can be assumed to be small (for
-   you), due to your application's behavior or to special hardware
-   support such as shared CPU caches, and
- - the search cost has no impact (for you), or you can keep it small
-   enough, e.g. by keeping your cpusets compact, and
- - low latency is required, even at the cost of cache hit rate etc.
-
-
-1.9 How do I use cpusets ?
---------------------------
-
-In order to minimize the impact of cpusets on critical kernel
-code, such as the scheduler, and due to the fact that the kernel
-does not support one task updating the memory placement of another
-task directly, the impact on a task of changing its cpuset CPU
-or Memory Node placement, or of changing to which cpuset a task
-is attached, is subtle.
-
-If a cpuset has its Memory Nodes modified, then for each task attached
-to that cpuset, the next time that the kernel attempts to allocate
-a page of memory for that task, the kernel will notice the change
-in the task's cpuset, and update its per-task memory placement to
-remain within the new cpuset's memory placement. If the task was using
-mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
-its new cpuset, then the task will continue to use whatever subset
-of MPOL_BIND nodes are still allowed in the new cpuset. If the task
-was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
-in the new cpuset, then the task will be essentially treated as if it
-was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
-as queried by get_mempolicy(), doesn't change). If a task is moved
-from one cpuset to another, then the kernel will adjust the task's
-memory placement, as above, the next time that the kernel attempts
-to allocate a page of memory for that task.
-
-If a cpuset has its 'cpuset.cpus' modified, then each task in that
-cpuset will have its allowed CPU placement changed immediately.
-Similarly, if a task's pid is written to another cpuset's 'tasks'
-file, then its allowed CPU placement is changed immediately. If such
-a task had been bound to some subset of its cpuset using the
-sched_setaffinity() call, the task will be allowed to run on any CPU
-allowed in its new cpuset, negating the effect of the prior
-sched_setaffinity() call.
-
-In summary, the memory placement of a task whose cpuset is changed is
-updated by the kernel, on the next allocation of a page for that task,
-and the processor placement is updated immediately.
-
-Normally, once a page is allocated (given a physical page
-of main memory) then that page stays on whatever node it
-was allocated, so long as it remains allocated, even if the
-cpuset's memory placement policy 'cpuset.mems' subsequently changes.
-If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
-tasks are attached to that cpuset, any pages that task had
-allocated to it on nodes in its previous cpuset are migrated
-to the task's new cpuset. The relative placement of the page within
-the cpuset is preserved during these migration operations if possible.
-For example, if the page was on the second valid node of the prior
-cpuset then the page will be placed on the second valid node of the
-new cpuset.
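-
-A sketch of enabling migration before attaching a job ("mig" is a
-hypothetical cpuset):
-
-  # mkdir /sys/fs/cgroup/cpuset/mig
-  # cd /sys/fs/cgroup/cpuset/mig
-  # /bin/echo 2-3 > cpuset.cpus
-  # /bin/echo 1 > cpuset.mems
-  # /bin/echo 1 > cpuset.memory_migrate   # pages follow future attaches
-  # /bin/echo $$ > tasks                  # this shell's pages migrate now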
-
-Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
-'cpuset.mems' file is modified, pages allocated to tasks in that
-cpuset, that were on nodes in the previous setting of 'cpuset.mems',
-will be moved to nodes in the new setting of 'mems.'
-Pages that were not in the task's prior cpuset, or in the cpuset's
-prior 'cpuset.mems' setting, will not be moved.
-
-There is an exception to the above. If hotplug functionality is used
-to remove all the CPUs that are currently assigned to a cpuset,
-then all the tasks in that cpuset will be moved to the nearest ancestor
-with non-empty cpus. But the moving of some (or all) tasks might fail
-if the cpuset is bound to another cgroup subsystem which has some
-restrictions on task attaching. In this failing case, those tasks will
-stay in the original cpuset, and the kernel will automatically update
-their cpus_allowed to allow all online CPUs. When memory hotplug
-functionality for removing Memory Nodes is available, a similar
-exception is expected to apply there as well. In general, the kernel
-prefers to violate cpuset placement rather than starve a task that has
-had all its allowed CPUs or Memory Nodes taken offline.
-
-There is a second exception to the above. GFP_ATOMIC requests are
-kernel internal allocations that must be satisfied immediately.
-The kernel may drop some requests, and in rare cases even panic, if a
-GFP_ATOMIC alloc fails. If the request cannot be satisfied within
-the current task's cpuset, then we relax the cpuset, and look for
-memory anywhere we can find it. It's better to violate the cpuset
-than stress the kernel.
-
-To start a new job that is to be contained within a cpuset, the steps
-are:
-
- 1) mkdir /sys/fs/cgroup/cpuset
- 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 4) Start a task that will be the "founding father" of the new job.
- 5) Attach that task to the new cpuset by writing its pid to the
-    /sys/fs/cgroup/cpuset tasks file for that cpuset.
- 6) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will set up a cpuset
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cpuset:
-
-  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cpuset Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cpuset
-
-There are several ways to query or modify cpusets:
- - via the cpuset file system directly, using the various cd, mkdir,
-   echo, cat, rmdir commands from the shell, or their equivalent from C.
- - via the C library libcpuset.
- - via the C library libcgroup.
-   (http://sourceforge.net/projects/libcg/)
- - via the python application cset.
-   (http://code.google.com/p/cpuset/)
-
-The sched_setaffinity calls can also be done at the shell prompt using
-SGI's runon or Robert Love's taskset. The mbind and set_mempolicy
-calls can be done at the shell prompt using the numactl command
-(part of Andi Kleen's numa package).
-
-2.
Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using the cpusets can be done through the cpuset -virtual filesystem. - -To mount it, type: -# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset - -Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the -tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset -is the cpuset that holds the whole system. - -If you want to create a new cpuset under /sys/fs/cgroup/cpuset: -# cd /sys/fs/cgroup/cpuset -# mkdir my_cpuset - -Now you want to do something with this cpuset. -# cd my_cpuset - -In this directory you can find several files: -# ls -cgroup.clone_children cpuset.memory_pressure -cgroup.event_control cpuset.memory_spread_page -cgroup.procs cpuset.memory_spread_slab -cpuset.cpu_exclusive cpuset.mems -cpuset.cpus cpuset.sched_load_balance -cpuset.mem_exclusive cpuset.sched_relax_domain_level -cpuset.mem_hardwall notify_on_release -cpuset.memory_migrate tasks - -Reading them will give you information about the state of this cpuset: -the CPUs and Memory Nodes it can use, the processes that are using -it, its properties. By writing to these files you can manipulate -the cpuset. - -Set some flags: -# /bin/echo 1 > cpuset.cpu_exclusive - -Add some cpus: -# /bin/echo 0-7 > cpuset.cpus - -Add some mems: -# /bin/echo 0-7 > cpuset.mems - -Now attach your shell to this cpuset: -# /bin/echo $$ > tasks - -You can also create cpusets inside your cpuset by using mkdir in this -directory. -# mkdir my_sub_cs - -To remove a cpuset, just use rmdir: -# rmdir my_sub_cs -This will fail if the cpuset is in use (has cpusets inside, or has -processes attached). - -Note that for legacy reasons, the "cpuset" filesystem exists as a -wrapper around the cgroup filesystem. - -The command - -mount -t cpuset X /sys/fs/cgroup/cpuset - -is equivalent to - -mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset -echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent - -2.2 Adding/removing cpus ------------------------- - -This is the syntax to use when writing in the cpus or mems files -in cpuset directories: - -# /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 -# /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4 - -To add a CPU to a cpuset, write the new list of CPUs including the -CPU to be added. To add 6 to the above cpuset: - -# /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 - -Similarly to remove a CPU from a cpuset, write the new list of CPUs -without the CPU to be removed. - -To remove all the CPUs: - -# /bin/echo "" > cpuset.cpus -> clear cpus list - -2.3 Setting flags ------------------ - -The syntax is very simple: - -# /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive' -# /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive' - -2.4 Attaching processes ------------------------ - -# /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another: - -# /bin/echo PID1 > tasks -# /bin/echo PID2 > tasks - ... -# /bin/echo PIDn > tasks - - -3. Questions -============ - -Q: what's up with this '/bin/echo' ? -A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cpuset file system, you won't be - able to tell whether a command succeeded or failed. 
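-
-   For example, the difference is visible when a write fails (a
-   sketch; the PID and the exact error message are illustrative):
-
-   # echo 999999 > tasks         # builtin: the failure may go unnoticed
-   # /bin/echo 999999 > tasks
-   /bin/echo: write error: No such process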
- -Q: When I attach processes, only the first of the line gets really attached ! -A: We can only return one error code per call to write(). So you should also - put only ONE pid. - -4. Contact -========== - -Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt deleted file mode 100644 index 3c1095c..0000000 --- a/Documentation/cgroups/devices.txt +++ /dev/null @@ -1,116 +0,0 @@ -Device Whitelist Controller - -1. Description: - -Implement a cgroup to track and enforce open and mknod restrictions -on device files. A device cgroup associates a device access -whitelist with each cgroup. A whitelist entry has 4 fields. -'type' is a (all), c (char), or b (block). 'all' means it applies -to all types and all major and minor numbers. Major and minor are -either an integer or * for all. Access is a composition of r -(read), w (write), and m (mknod). - -The root device cgroup starts with rwm to 'all'. A child device -cgroup gets a copy of the parent. Administrators can then remove -devices from the whitelist or add new entries. A child cgroup can -never receive a device access which is denied by its parent. - -2. User Interface - -An entry is added using devices.allow, and removed using -devices.deny. For instance - - echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow - -allows cgroup 1 to read and mknod the device usually known as -/dev/null. Doing - - echo a > /sys/fs/cgroup/1/devices.deny - -will remove the default 'a *:* rwm' entry. Doing - - echo a > /sys/fs/cgroup/1/devices.allow - -will add the 'a *:* rwm' entry to the whitelist. - -3. Security - -Any task can move itself between cgroups. This clearly won't -suffice, but we can decide the best way to adequately restrict -movement as people get some experience with this. We may just want -to require CAP_SYS_ADMIN, which at least is a separate bit from -CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendant of the current one. Or we may want to use -CAP_MAC_ADMIN, since we really are trying to lock down root. - -CAP_SYS_ADMIN is needed to modify the whitelist or move another -task to a new cgroup. (Again we'll probably want to change that). - -A cgroup may not be granted more permissions than the cgroup's -parent has. - -4. Hierarchy - -device cgroups maintain hierarchy by making sure a cgroup never has more -access permissions than its parent. Every time an entry is written to -a cgroup's devices.deny file, all its children will have that entry removed -from their whitelist and all the locally set whitelist entries will be -re-evaluated. In case one of the locally set whitelist entries would provide -more access than the cgroup's parent, it'll be removed from the whitelist. - -Example: - A - / \ - B - - group behavior exceptions - A allow "b 8:* rwm", "c 116:1 rw" - B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" - -If a device is denied in group A: - # echo "c 116:* r" > A/devices.deny -it'll propagate down and after revalidating B's entries, the whitelist entry -"c 116:2 rwm" will be removed: - - group whitelist entries denied devices - A all "b 8:* rwm", "c 116:* rw" - B "c 1:3 rwm", "b 3:* rwm" all the rest - -In case parent's exceptions change and local exceptions are not allowed -anymore, they'll be deleted. 
-
-Notice that new whitelist entries will not be propagated:
-		A
-	       / \
-	      B
-    group        whitelist entries                    denied devices
-    A            "c 1:3 rwm", "c 1:5 r"               all the rest
-    B            "c 1:3 rwm", "c 1:5 r"               all the rest
-
-when adding "c *:3 rwm":
-	# echo "c *:3 rwm" >A/devices.allow
-
-the result:
-    group        whitelist entries                    denied devices
-    A            "c *:3 rwm", "c 1:5 r"               all the rest
-    B            "c 1:3 rwm", "c 1:5 r"               all the rest
-
-but now it'll be possible to add new entries to B:
-	# echo "c 2:3 rwm" >B/devices.allow
-	# echo "c 50:3 r" >B/devices.allow
-or even
-	# echo "c *:3 rwm" >B/devices.allow
-
-Allowing or denying all by writing 'a' to devices.allow or devices.deny
-will not be possible once the device cgroup has children.
-
-4.1 Hierarchy (internal implementation)
-
-The device cgroup is implemented internally using a behavior (ALLOW,
-DENY) and a list of exceptions. The internal state is controlled using
-the same user interface to preserve compatibility with the previous
-whitelist-only implementation. Removal or addition of exceptions that
-will reduce the access to devices will be propagated down the
-hierarchy. For every propagated exception, the effective rules will be
-re-evaluated based on the current parent's access rules.
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt
deleted file mode 100644
index e831cb2..0000000
--- a/Documentation/cgroups/freezer-subsystem.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-The cgroup freezer is useful to batch job management systems which
-start and stop sets of tasks in order to schedule the resources of a
-machine according to the desires of a system administrator. This sort
-of program is often used on HPC clusters to schedule access to the
-cluster as a whole. The cgroup freezer uses cgroups to describe the
-set of tasks to be started/stopped by the batch job management system.
-It also provides a means to start and stop the tasks composing the job.
-
-The cgroup freezer will also be useful for checkpointing running groups
-of tasks. The freezer allows the checkpoint code to obtain a consistent
-image of the tasks by attempting to force the tasks in a cgroup into a
-quiescent state. Once the tasks are quiescent another task can
-walk /proc or invoke a kernel interface to gather information about the
-quiesced tasks. Checkpointed tasks can be restarted later should a
-recoverable error occur. This also allows the checkpointed tasks to be
-migrated between nodes in a cluster by copying the gathered information
-to another node and restarting the tasks there.
-
-Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
-and resuming tasks in userspace. Both of these signals are observable
-from within the tasks we wish to freeze. While SIGSTOP cannot be
-caught, blocked, or ignored, it can be seen by waiting or ptracing
-parent tasks. SIGCONT is especially unsuitable since it can be caught
-by the task. Any programs designed to watch for SIGSTOP and SIGCONT
-could be broken by attempting to use SIGSTOP and SIGCONT to stop and
-resume tasks. We can demonstrate this problem using nested bash shells:
-
-	$ echo $$
-	16644
-	$ bash
-	$ echo $$
-	16690
-
-	From a second, unrelated bash shell:
-	$ kill -SIGSTOP 16690
-	$ kill -SIGCONT 16690
-
-	<at this point 16690 exits and causes 16644 to exit too>
-
-This happens because bash can observe both signals and choose how it
-responds to them.
-
-Another example of a program which catches and responds to these
-signals is gdb. In fact, any program designed to use ptrace is likely
-to have a problem with this method of stopping and resuming tasks.
-
-In contrast, the cgroup freezer uses the kernel freezer code to
-prevent the freeze/unfreeze cycle from becoming visible to the tasks
-being frozen. This allows the bash example above and gdb to run as
-expected.
-
-The cgroup freezer is hierarchical. Freezing a cgroup freezes all
-tasks belonging to the cgroup and all its descendant cgroups. Each
-cgroup has its own state (self-state) and the state inherited from the
-parent (parent-state). Iff both states are THAWED, the cgroup is
-THAWED.
-
-The following cgroupfs files are created by cgroup freezer.
-
-* freezer.state: Read-write.
-
-  When read, returns the effective state of the cgroup - "THAWED",
-  "FREEZING" or "FROZEN". This is the combined self and parent-states.
-  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
-
-  A FREEZING cgroup transitions into the FROZEN state when all tasks
-  belonging to the cgroup and its descendants become frozen. Note that
-  a cgroup reverts to FREEZING from FROZEN after a new task is added
-  to the cgroup or one of its descendant cgroups until the new task is
-  frozen.
-
-  When written, sets the self-state of the cgroup. Two values are
-  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
-  if not already freezing, enters FREEZING state along with all its
-  descendant cgroups.
-
-  If THAWED is written, the self-state of the cgroup is changed to
-  THAWED. Note that the effective state may not change to THAWED if
-  the parent-state is still freezing. If a cgroup's effective state
-  becomes THAWED, all its descendants which are freezing because of
-  the cgroup also leave the freezing state.
-
-* freezer.self_freezing: Read only.
-
-  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
-  This value is 1 iff the last write to freezer.state was "FROZEN".
-
-* freezer.parent_freezing: Read only.
-
-  Shows the parent-state. 0 if none of the cgroup's ancestors is
-  frozen; otherwise, 1.
-
-The root cgroup is non-freezable and the above interface files don't
-exist.
-
-* Examples of usage:
-
-   # mkdir /sys/fs/cgroup/freezer
-   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
-   # mkdir /sys/fs/cgroup/freezer/0
-   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
-
-to get the status of the freezer subsystem:
-
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-to freeze all tasks in the container:
-
-   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FREEZING
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FROZEN
-
-to unfreeze all tasks in the container:
-
-   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-This is the basic mechanism which should do the right thing for user
-space tasks in a simple scenario.
diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroups/hugetlb.txt
deleted file mode 100644
index 106245c..0000000
--- a/Documentation/cgroups/hugetlb.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-HugeTLB Controller
--------------------
-
-The HugeTLB controller allows limiting HugeTLB usage per control group
-and enforces the controller limit during page fault. Since HugeTLB
-doesn't support page reclaim, enforcing the limit at page fault time
-implies that the application will get a SIGBUS signal if it tries to
-access HugeTLB pages beyond its limit. This requires the application
-to know beforehand how many HugeTLB pages it would require for its use.
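-
-The hugepage sizes supported by the running kernel (and hence which
-control files appear, see below) can be checked via sysfs; for
-example, on an x86_64 box (illustrative output):
-
-  # ls /sys/kernel/mm/hugepages/
-  hugepages-1048576kB  hugepages-2048kB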
-
-A HugeTLB controller hierarchy can be created by first mounting the
-cgroup filesystem.
-
-# mount -t cgroup -o hugetlb none /sys/fs/cgroup
-
-With the above step, the initial or the parent HugeTLB group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the
-tasks in the system. /sys/fs/cgroup/tasks lists the tasks in this
-cgroup.
-
-New groups can be created under the parent group /sys/fs/cgroup.
-
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it.
-
-Brief summary of control files
-
- hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
- hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
- hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
- hugetlb.<hugepagesize>.failcnt            # show the number of allocation failures due to the HugeTLB limit
-
-For a system supporting two hugepage sizes (16M and 16G) the control
-files include:
-
-hugetlb.16GB.limit_in_bytes
-hugetlb.16GB.max_usage_in_bytes
-hugetlb.16GB.usage_in_bytes
-hugetlb.16GB.failcnt
-hugetlb.16MB.limit_in_bytes
-hugetlb.16MB.max_usage_in_bytes
-hugetlb.16MB.usage_in_bytes
-hugetlb.16MB.failcnt
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
deleted file mode 100644
index 8870b02..0000000
--- a/Documentation/cgroups/memcg_test.txt
+++ /dev/null
@@ -1,280 +0,0 @@
-Memory Resource Controller(Memcg) Implementation Memo.
-Last Updated: 2010/2
-Base Kernel Version: based on 2.6.33-rc7-mm (candidate for 34).
-
-Because the VM is getting complex (one of the reasons is memcg...),
-memcg's behavior is complex. This is a document for memcg's internal
-behavior. Please note that implementation details can be changed.
-
-(*) Topics on the API should be in Documentation/cgroups/memory.txt.
-
-0. How to record usage ?
-   2 objects are used.
-
-   page_cgroup ....an object per page.
-	Allocated at boot or memory hotplug. Freed at memory hot removal.
-
-   swap_cgroup ... an entry per swp_entry.
-	Allocated at swapon(). Freed at swapoff().
-
-   The page_cgroup has a USED bit, and double counting against a
-   page_cgroup never occurs. swap_cgroup is used only when a charged
-   page is swapped out.
-
-1. Charge
-   a page/swp_entry may be charged (usage += PAGE_SIZE) at
-
-	mem_cgroup_try_charge()
-
-2. Uncharge
-   a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
-
-	mem_cgroup_uncharge()
-	  Called when a page's refcount goes down to 0.
-
-	mem_cgroup_uncharge_swap()
-	  Called when swp_entry's refcnt goes down to 0. A charge against
-	  swap disappears.
-
-3. charge-commit-cancel
-	Memcg pages are charged in two steps:
-		mem_cgroup_try_charge()
-		mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
-
-	At try_charge(), there are no flags to say "this page is charged".
-	At this point, usage += PAGE_SIZE.
-
-	At commit(), the page is associated with the memcg.
-
-	At cancel(), simply usage -= PAGE_SIZE.
-
-In the explanation below, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
-
-4. Anonymous
-	Anonymous pages are newly allocated at
-		  - page fault into a MAP_ANONYMOUS mapping.
-		  - Copy-On-Write.
-
-	4.1 Swap-in.
-	At swap-in, the page is taken from the swap-cache. There are 2 cases.
-
-	(a) If the SwapCache is newly allocated and read, it has no charges.
-	(b) If the SwapCache has been mapped by processes, it has been
-	    charged already.
-
-	4.2 Swap-out.
-	At swap-out, the typical state transition is as below.
-
-	(a) add to swap cache. (marked as SwapCache)
-	    swp_entry's refcnt += 1.
-	(b) fully unmapped.
-	    swp_entry's refcnt += # of ptes.
-	(c) write back to swap.
-	(d) delete from swap cache. (remove from SwapCache)
-	    swp_entry's refcnt -= 1.
-
-	Finally, at task exit,
-	(e) zap_pte() is called and swp_entry's refcnt -= 1 -> 0.
-
-5. Page Cache
-	Page Cache is charged at
-	- add_to_page_cache_locked().
-
-	The logic is very clear. (About migration, see below.)
-	Note: __remove_from_page_cache() is called by remove_from_page_cache()
-	and __remove_mapping().
-
-6. Shmem(tmpfs) Page Cache
-	The best way to understand shmem's page state transitions is to
-	read mm/shmem.c. But a brief explanation of the behavior of memcg
-	around shmem will be helpful to understand the logic.
-
-	Shmem's page (just leaf page, not direct/indirect block) can be on
-	- the radix-tree of shmem's inode.
-	- SwapCache.
-	- both the radix-tree and SwapCache. This happens at swap-in
-	  and swap-out.
-
-	It's charged when...
-	- a new page is added to shmem's radix-tree.
-	- a swp page is read. (moves a charge from swap_cgroup to page_cgroup)
-
-7. Page Migration
-
-	mem_cgroup_migrate()
-
-8. LRU
-	Each memcg has its own private LRU. Now, its handling is under the
-	global VM's control (meaning that it's handled under the global
-	zone->lru_lock). Almost all routines around memcg's LRU are called
-	by the global LRU's list management functions under zone->lru_lock.
-
-	A special function is mem_cgroup_isolate_pages(). This scans the
-	memcg's private LRU and calls __isolate_lru_page() to extract a
-	page from the LRU. (By __isolate_lru_page(), the page is removed
-	from both the global and the private LRU.)
-
-
-9. Typical Tests.
-
-	Tests for racy cases.
-
-	9.1 Small limit to memcg.
-	When testing racy cases, it is a good idea to set the memcg limit
-	very small, rather than in GB. Many races have been found in tests
-	under xKB or xxMB limits. (Memory behavior under GB limits and
-	under MB limits shows very different situations.)
-
-	9.2 Shmem
-	Historically, memcg's shmem handling was poor and we saw a number
-	of troubles here. This is because shmem is page-cache but can also
-	be SwapCache. Testing with shmem/tmpfs is always a good test.
-
-	9.3 Migration
-	For NUMA, migration is another special case. Cpusets are useful
-	for easy testing. Following is a sample script to do migration.
-
-	mount -t cgroup -o cpuset none /opt/cpuset
-
-	mkdir /opt/cpuset/01
-	echo 1 > /opt/cpuset/01/cpuset.cpus
-	echo 0 > /opt/cpuset/01/cpuset.mems
-	echo 1 > /opt/cpuset/01/cpuset.memory_migrate
-	mkdir /opt/cpuset/02
-	echo 1 > /opt/cpuset/02/cpuset.cpus
-	echo 1 > /opt/cpuset/02/cpuset.mems
-	echo 1 > /opt/cpuset/02/cpuset.memory_migrate
-
-	In the above setup, when you move a task from 01 to 02, page
-	migration from node 0 to node 1 will occur. Following is a script
-	to migrate all tasks under a cpuset.
-	--
-	move_task()
-	{
-	for pid in $1
-	do
-		/bin/echo $pid >$2/tasks 2>/dev/null
-		echo -n $pid
-		echo -n " "
-	done
-	echo END
-	}
-
-	G1_TASK=`cat ${G1}/tasks`
-	G2_TASK=`cat ${G2}/tasks`
-	move_task "${G1_TASK}" ${G2} &
-	--
-	9.4 Memory hotplug.
-	Memory hotplug is another good test. To offline memory, do the
-	following:
-	# echo offline > /sys/devices/system/memory/memoryXXX/state
-	(XXX is the memory section number.)
-	This is an easy way to test page migration, too.
-
-	9.5 mkdir/rmdir
-	When using hierarchy, mkdir/rmdir tests should be done.
-	Use tests like the following.
- - echo 1 >/opt/cgroup/01/memory/use_hierarchy - mkdir /opt/cgroup/01/child_a - mkdir /opt/cgroup/01/child_b - - set limit to 01. - add limit to 01/child_b - run jobs under child_a and child_b - - create/delete the following groups at random while the jobs are running. - /opt/cgroup/01/child_a/child_aa - /opt/cgroup/01/child_b/child_bb - /opt/cgroup/01/child_c - - Running new jobs in a new group is also a good test. - - 9.6 Mount with other subsystems. - Mounting with other subsystems is a good test because there are - races and lock dependencies with other cgroup subsystems. - - example) - # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices - - and do task moves, mkdir, rmdir, etc. under this. - - 9.7 swapoff. - Besides the management of swap being one of the complicated parts of memcg, - the call path of swap-in at swapoff is not the same as the usual swap-in path. - It's worth testing explicitly. - - For example, a test like the following is good. - (Shell-A) - # mount -t cgroup none /cgroup -o memory - # mkdir /cgroup/test - # echo 40M > /cgroup/test/memory.limit_in_bytes - # echo 0 > /cgroup/test/tasks - Run a malloc(100M) program under this. You'll see 60M of swap usage. - (Shell-B) - # move all tasks in /cgroup/test to /cgroup - # /sbin/swapoff -a - # rmdir /cgroup/test - # kill malloc task. - - Of course, the tmpfs vs. swapoff case should be tested, too. - - 9.8 OOM-Killer - Out-of-memory caused by a memcg's limit will kill tasks under - the memcg. When hierarchy is used, a task under the hierarchy - will be killed by the kernel. - In this case, panic_on_oom shouldn't be invoked and tasks - in other groups shouldn't be killed. - - It's not difficult to cause OOM under memcg, as follows. - Case A) when you can use swapoff - #swapoff -a - #echo 50M > /memory.limit_in_bytes - run 51M of malloc - - Case B) when you use the mem+swap limitation. - #echo 50M > memory.limit_in_bytes - #echo 50M > memory.memsw.limit_in_bytes - run 51M of malloc - - 9.9 Move charges at task migration - Charges associated with a task can be moved along with task migration. - - (Shell-A) - #mkdir /cgroup/A - #echo $$ >/cgroup/A/tasks - run some programs which use some amount of memory in /cgroup/A. - - (Shell-B) - #mkdir /cgroup/B - #echo 1 >/cgroup/B/memory.move_charge_at_immigrate - #echo "pid of the program running in group A" >/cgroup/B/tasks - - You can see that charges have been moved by reading *.usage_in_bytes or - memory.stat of both A and B. - See 8.2 of Documentation/cgroups/memory.txt for what value should be - written to move_charge_at_immigrate. - - 9.10 Memory thresholds - The memory controller implements memory thresholds using the cgroups notification - API. You can use tools/cgroup/cgroup_event_listener.c to test it. - - (Shell-A) Create a cgroup and run the event listener - # mkdir /cgroup/A - # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M - - (Shell-B) Add a task to the cgroup and try to allocate and free memory - # echo $$ >/cgroup/A/tasks - # a="$(dd if=/dev/zero bs=1M count=10)" - # a= - - You will see a message from cgroup_event_listener every time you cross - a threshold. - - Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. - - It's a good idea to test the root cgroup as well. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt deleted file mode 100644 index ff71e16..0000000 --- a/Documentation/cgroups/memory.txt +++ /dev/null @@ -1,876 +0,0 @@ -Memory Resource Controller - -NOTE: This document is hopelessly outdated and it asks for a complete - rewrite.
It still contains useful information, so we are keeping it - here, but make sure to check the current code if you need a deeper - understanding. - -NOTE: The Memory Resource Controller has generically been referred to as the - memory controller in this document. Do not confuse the memory controller - used here with the memory controller that is used in hardware. - -(For editors) -In this document: - When we mention a cgroup (cgroupfs's directory) with the memory controller, - we call it "memory cgroup". When you read git logs and source code, you'll - see that patch titles and function names tend to use "memcg". - In this document, we avoid using it. - -Benefits and Purpose of the memory controller - -The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable -uses of the memory controller. The memory controller can be used to - -a. Isolate an application or a group of applications - Memory-hungry applications can be isolated and limited to a smaller - amount of memory. -b. Create a cgroup with a limited amount of memory; this can be used - as a good alternative to booting with mem=XXXX. -c. Virtualization solutions can control the amount of memory they want - to assign to a virtual machine instance. -d. A CD/DVD burner could control the amount of memory used by the - rest of the system to ensure that burning does not fail due to lack - of available memory. -e. There are several other use cases; find one or use the controller just - for fun (to learn and hack on the VM subsystem). - -Current Status: linux-2.6.34-mmotm (development version of April 2010) - -Features: - - accounting of anonymous page, file cache, and swap cache usage, and limiting them. - - pages are linked to per-memcg LRUs exclusively, and there is no global LRU. - - optionally, memory+swap usage can be accounted and limited. - - hierarchical accounting - - soft limit - - moving (recharging) the account when moving a task is selectable. - - usage threshold notifier - - memory pressure notifier - - oom-killer disable knob and oom-notifier - - Root cgroup has no limit controls. - - Kernel memory support is a work in progress, and the current version provides - basic functionality. (See Section 2.7) - -Brief summary of control files. - - tasks # attach a task (thread) and show the list of threads - cgroup.procs # show the list of processes - cgroup.event_control # an interface for event_fd() - memory.usage_in_bytes # show current usage for memory - (See 5.5 for details) - memory.memsw.usage_in_bytes # show current usage for memory+Swap - (See 5.5 for details) - memory.limit_in_bytes # set/show limit of memory usage - memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage - memory.failcnt # show the number of times memory usage hit limits - memory.memsw.failcnt # show the number of times memory+Swap usage hit limits - memory.max_usage_in_bytes # show max memory usage recorded - memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded - memory.soft_limit_in_bytes # set/show soft limit of memory usage - memory.stat # show various statistics - memory.use_hierarchy # set/show hierarchical account enabled - memory.force_empty # trigger forced move charge to parent - memory.pressure_level # set memory pressure notifications - memory.swappiness # set/show swappiness parameter of vmscan - (See sysctl's vm.swappiness) - memory.move_charge_at_immigrate # set/show controls of moving charges - memory.oom_control # set/show oom controls.
- memory.numa_stat # show the number of memory usage per numa node - - memory.kmem.limit_in_bytes # set/show hard limit for kernel memory - memory.kmem.usage_in_bytes # show current kernel memory allocation - memory.kmem.failcnt # show the number of kernel memory usage hits limits - memory.kmem.max_usage_in_bytes # show max kernel memory usage recorded - - memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory - memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation - memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits - memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded - -1. History - -The memory controller has a long history. A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted -there were several implementations for memory control. The goal of the -RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is -at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. - -2. Memory Control - -Memory is a unique resource in the sense that it is present in a limited -amount. If a task requires a lot of CPU processing, the task can spread -its processing over a period of hours, days, months or years, but with -memory, the same physical memory needs to be reused to accomplish the task. - -The memory controller implementation has been divided into phases. These -are: - -1. Memory controller -2. mlock(2) controller -3. Kernel user memory accounting and slab control -4. user mappings length controller - -The memory controller is the first controller developed. - -2.1. Design - -The core of the design is a counter called the page_counter. The -page_counter tracks the current memory usage and limit of the group of -processes associated with the controller. Each cgroup has a memory controller -specific data structure (mem_cgroup) associated with it. - -2.2. Accounting - - +--------------------+ - | mem_cgroup | - | (page_counter) | - +--------------------+ - / ^ \ - / | \ - +---------------+ | +---------------+ - | mm_struct | |.... | mm_struct | - | | | | | - +---------------+ | +---------------+ - | - + --------------+ - | - +---------------+ +------+--------+ - | page +----------> page_cgroup| - | | | | - +---------------+ +---------------+ - - (Figure 1: Hierarchy of Accounting) - - -Figure 1 shows the important aspects of the controller - -1. Accounting happens per cgroup -2. Each mm_struct knows about which cgroup it belongs to -3. Each page has a pointer to the page_cgroup, which in turn knows the - cgroup it belongs to - -The accounting is done as follows: mem_cgroup_charge_common() is invoked to -set up the necessary data structures and check if the cgroup that is being -charged is over its limit. If it is, then reclaim is invoked on the cgroup. -More details can be found in the reclaim section of this document. -If everything goes well, a page meta-data-structure called page_cgroup is -updated. page_cgroup has its own LRU on cgroup. -(*) page_cgroup structure is allocated at boot/memory-hotplug time. 
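The charging flow described above can be observed from userspace with nothing more than the control files. The following is a minimal sketch, assuming the memory controller is mounted at /sys/fs/cgroup/memory and using "demo" as a hypothetical group name:

  # mkdir /sys/fs/cgroup/memory/demo              # creates a mem_cgroup with its own page_counter
  # echo $$ > /sys/fs/cgroup/memory/demo/tasks    # this shell's mm_struct is now associated with "demo"
  # dd if=/dev/zero of=/tmp/f bs=1M count=16      # touch roughly 16M of page cache
  # cat /sys/fs/cgroup/memory/demo/memory.usage_in_bytes

Every page charged on behalf of the shell (and its children) goes through mem_cgroup_charge_common() against "demo"'s counter, so usage_in_bytes should grow by roughly the amount of memory touched.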
- -2.2.1 Accounting details - -All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. -Some pages which are never reclaimable and will not be on the LRU -are not accounted. We just account pages under usual VM management. - -RSS pages are accounted at page_fault unless they've already been accounted -for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). While it's mapped into the page tables of -processes, duplicate accounting is carefully avoided. - -An RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. Even if RSS pages are fully -unmapped (by kswapd), they may exist as SwapCache in the system until they -are really freed. Such SwapCaches are also accounted. -A swapped-in page is not accounted until it's mapped. - -Note: The kernel does swapin-readahead and reads multiple swaps at once. -This means swapped-in pages may contain pages for other tasks than a task -causing page fault. So, we avoid accounting at swap-in I/O. - -At page migration, accounting information is kept. - -Note: we just account pages-on-LRU because our purpose is to control amount -of used pages; not-on-LRU pages tend to be out-of-control from VM view. - -2.3 Shared Page Accounting - -Shared pages are accounted on the basis of the first touch approach. The -cgroup that first touches a page is accounted for the page. The principle -behind this approach is that a cgroup that aggressively uses a shared -page will eventually get charged for it (once it is uncharged from -the cgroup that brought it in -- this will happen on memory pressure). - -But see section 8.2: when moving a task to another cgroup, its pages may -be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. - -Exception: If CONFIG_MEMCG_SWAP is not used. -When you do swapoff and make swapped-out pages of shmem(tmpfs) to -be backed into memory in force, charges for pages are accounted against the -caller of swapoff rather than the users of shmem. - -2.4 Swap Extension (CONFIG_MEMCG_SWAP) - -Swap Extension allows you to record charge for swap. A swapped-in page is -charged back to original page allocator if possible. - -When swap is accounted, following files are added. - - memory.memsw.usage_in_bytes. - - memory.memsw.limit_in_bytes. - -memsw means memory+swap. Usage of memory+swap is limited by -memsw.limit_in_bytes. - -Example: Assume a system with 4G of swap. A task which allocates 6G of memory -(by mistake) under 2G memory limitation will use all swap. -In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap. -By using the memsw limit, you can avoid system OOM which can be caused by swap -shortage. - -* why 'memory+swap' rather than swap. -The global LRU(kswapd) can swap out arbitrary pages. Swap-out means -to move account from memory to swap...there is no change in usage of -memory+swap. In other words, when we want to limit the usage of swap without -affecting global LRU, memory+swap limit is better than just limiting swap from -an OS point of view. - -* What happens when a cgroup hits memory.memsw.limit_in_bytes -When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out -in this cgroup. Then, swap-out will not be done by cgroup routine and file -caches are dropped. But as mentioned above, global LRU can do swapout memory -from it for sanity of the system's memory management state. You can't forbid -it by cgroup. 
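The 2G/3G example above maps directly onto the control files. A minimal sketch, assuming CONFIG_MEMCG_SWAP is enabled, the controller is mounted at /sys/fs/cgroup/memory, and "leaky" is a hypothetical group name:

  # mkdir /sys/fs/cgroup/memory/leaky
  # echo 2G > /sys/fs/cgroup/memory/leaky/memory.limit_in_bytes        # memory-only limit
  # echo 3G > /sys/fs/cgroup/memory/leaky/memory.memsw.limit_in_bytes  # memory+swap limit
  # echo $$ > /sys/fs/cgroup/memory/leaky/tasks

Since memsw accounts the sum of memory and swap, memory.memsw.limit_in_bytes must be set greater than or equal to memory.limit_in_bytes. The runaway 6G allocation then fails once 2G of memory plus 1G of swap are in use, instead of consuming all 4G of swap.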
- -2.5 Reclaim - -Each cgroup maintains a per cgroup LRU which has the same structure as -global VM. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. (See 10. OOM Control below.) - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per-cgroup LRU -list. - -NOTE: Reclaim does not work for the root cgroup, since we cannot set any -limits on the root cgroup. - -Note2: When panic_on_oom is set to "2", the whole system will panic. - -When oom event notifier is registered, event will be delivered. -(See oom_control section) - -2.6 Locking - - lock_page_cgroup()/unlock_page_cgroup() should not be called under - mapping->tree_lock. - - Other lock order is following: - PG_locked. - mm->page_table_lock - zone->lru_lock - lock_page_cgroup. - In many cases, just lock_page_cgroup() is called. - per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by - zone->lru_lock, it has no lock of its own. - -2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) - -With the Kernel memory extension, the Memory Controller is able to limit -the amount of kernel memory used by the system. Kernel memory is fundamentally -different than user memory, since it can't be swapped out, which makes it -possible to DoS the system by consuming too much of this precious resource. - -Kernel memory won't be accounted at all until limit on a group is set. This -allows for existing setups to continue working without disruption. The limit -cannot be set if the cgroup have children, or if there are already tasks in the -cgroup. Attempting to set the limit under those conditions will return -EBUSY. -When use_hierarchy == 1 and a group is accounted, its children will -automatically be accounted regardless of their limit value. - -After a group is first limited, it will be kept being accounted until it -is removed. The memory limitation itself, can of course be removed by writing --1 to memory.kmem.limit_in_bytes. In this case, kmem will be accounted, but not -limited. - -Kernel memory limits are not imposed for the root cgroup. Usage for the root -cgroup may or may not be accounted. The memory used is accumulated into -memory.kmem.usage_in_bytes, or in a separate counter when it makes sense. -(currently only for tcp). -The main "kmem" counter is fed into the main counter, so kmem charges will -also be visible from the user counter. - -Currently no soft limit is implemented for kernel memory. It is future work -to trigger slab reclaim when those limits are reached. - -2.7.1 Current Kernel Memory resources accounted - -* stack pages: every process consumes some stack pages. By accounting into -kernel memory, we prevent new processes from being created when the kernel -memory usage is too high. - -* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy -of each kmem_cache is created every time the cache is touched by the first time -from inside the memcg. The creation is done lazily, so some objects can still be -skipped while the cache is being created. All objects in a slab page should -belong to the same memcg. This only fails to hold when a task is migrated to a -different memcg during the page allocation by the cache. - -* sockets memory pressure: some sockets protocols have memory pressure -thresholds. 
The Memory Controller allows them to be controlled individually -per cgroup, instead of globally. - -* tcp memory pressure: sockets memory pressure for the tcp protocol. - -2.7.2 Common use cases - -Because the "kmem" counter is fed to the main user counter, kernel memory can -never be limited completely independently of user memory. Say "U" is the user -limit, and "K" the kernel limit. There are three possible ways limits can be -set: - - U != 0, K = unlimited: - This is the standard memcg limitation mechanism already present before kmem - accounting. Kernel memory is completely ignored. - - U != 0, K < U: - Kernel memory is a subset of the user memory. This setup is useful in - deployments where the total amount of memory per-cgroup is overcommited. - Overcommiting kernel memory limits is definitely not recommended, since the - box can still run out of non-reclaimable memory. - In this case, the admin could set up K so that the sum of all groups is - never greater than the total memory, and freely set U at the cost of his - QoS. - WARNING: In the current implementation, memory reclaim will NOT be - triggered for a cgroup when it hits K while staying below U, which makes - this setup impractical. - - U != 0, K >= U: - Since kmem charges will also be fed to the user counter and reclaim will be - triggered for the cgroup for both kinds of memory. This setup gives the - admin a unified view of memory, and it is also useful for people who just - want to track kernel memory usage. - -3. User Interface - -3.0. Configuration - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_MEMCG -c. Enable CONFIG_MEMCG_SWAP (to use swap extension) -d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) - -3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) -# mount -t tmpfs none /sys/fs/cgroup -# mkdir /sys/fs/cgroup/memory -# mount -t cgroup none /sys/fs/cgroup/memory -o memory - -3.2. Make the new group and move bash into it -# mkdir /sys/fs/cgroup/memory/0 -# echo $$ > /sys/fs/cgroup/memory/0/tasks - -Since now we're in the 0 cgroup, we can alter the memory limit: -# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes - -NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, -mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.) - -NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). -NOTE: We cannot set limits on the root cgroup any more. - -# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes -4194304 - -We can check the usage: -# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes -1216512 - -A successful write to this file does not guarantee a successful setting of -this limit to the value written into the file. This can be due to a -number of factors, such as rounding up to page boundaries or the total -availability of memory on the system. The user is required to re-read -this file after a write to guarantee the value committed by the kernel. - -# echo 1 > memory.limit_in_bytes -# cat memory.limit_in_bytes -4096 - -The memory.failcnt field gives the number of times that the cgroup limit was -exceeded. - -The memory.stat file gives accounting information. Now, the number of -caches, RSS and Active pages/Inactive pages are shown. - -4. Testing - -For testing features and implementation, see memcg_test.txt. - -Performance test is also important. To see pure memory controller's overhead, -testing on tmpfs will give you good numbers of small overheads. -Example: do kernel make on tmpfs. - -Page-fault scalability is also important. 
At measuring parallel -page fault test, multi-process test may be better than multi-thread -test because it has noise of shared objects/status. - -But the above two are testing extreme situations. -Trying usual test under memory controller is always helpful. - -4.1 Troubleshooting - -Sometimes a user might find that the application under a cgroup is -terminated by the OOM killer. There are several causes for this: - -1. The cgroup limit is too low (just too low to do anything useful) -2. The user is using anonymous memory and swap is turned off or too low - -A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of -some of the pages cached in the cgroup (page cache pages). - -To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and -seeing what happens will be helpful. - -4.2 Task migration - -When a task migrates from one cgroup to another, its charge is not -carried forward by default. The pages allocated from the original cgroup still -remain charged to it, the charge is dropped when the page is freed or -reclaimed. - -You can move charges of a task along with task migration. -See 8. "Move charges at task migration" - -4.3 Removing a cgroup - -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. (because we charge against pages, not -against tasks.) - -We move the stats to root (if use_hierarchy==0) or parent (if -use_hierarchy==1), and no change on the charge except uncharging -from the child. - -Charges recorded in swap information is not updated at removal of cgroup. -Recorded information is discarded and a cgroup which uses swap (swapcache) -will be charged as a new owner of it. - -About use_hierarchy, see Section 6. - -5. Misc. interfaces. - -5.1 force_empty - memory.force_empty interface is provided to make cgroup's memory usage empty. - When writing anything to this - - # echo 0 > memory.force_empty - - the cgroup will be reclaimed and as many pages reclaimed as possible. - - The typical use case for this interface is before calling rmdir(). - Because rmdir() moves all pages to parent, some out-of-use page caches can be - moved to the parent. If you want to avoid that, force_empty will be useful. - - Also, note that when memory.kmem.limit_in_bytes is set the charges due to - kernel pages will still be seen. This is not considered a failure and the - write will still return success. In this case, it is expected that - memory.kmem.usage_in_bytes == memory.usage_in_bytes. - - About use_hierarchy, see Section 6. - -5.2 stat file - -memory.stat file includes following statistics - -# per-memory cgroup local status -cache - # of bytes of page cache memory. -rss - # of bytes of anonymous and swap cache memory (includes - transparent hugepages). -rss_huge - # of bytes of anonymous transparent hugepages. -mapped_file - # of bytes of mapped file (includes tmpfs/shmem) -pgpgin - # of charging events to the memory cgroup. The charging - event happens each time a page is accounted as either mapped - anon page(RSS) or cache page(Page Cache) to the cgroup. -pgpgout - # of uncharging events to the memory cgroup. The uncharging - event happens each time a page is unaccounted from the cgroup. -swap - # of bytes of swap usage -dirty - # of bytes that are waiting to get written back to the disk. -writeback - # of bytes of file/anon cache that are queued for syncing to - disk. 
-inactive_anon - # of bytes of anonymous and swap cache memory on inactive - LRU list. -active_anon - # of bytes of anonymous and swap cache memory on active - LRU list. -inactive_file - # of bytes of file-backed memory on inactive LRU list. -active_file - # of bytes of file-backed memory on active LRU list. -unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). - -# status considering hierarchy (see memory.use_hierarchy settings) - -hierarchical_memory_limit - # of bytes of memory limit with regard to the hierarchy - under which the memory cgroup is. -hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to the - hierarchy under which the memory cgroup is. - -total_<counter> - # hierarchical version of <counter>, which in - addition to the cgroup's own value includes the - sum of all hierarchical children's values of - <counter>, i.e. total_cache - -# The following additional stats are dependent on CONFIG_DEBUG_VM. - -recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) -recent_rotated_file - VM internal parameter. (see mm/vmscan.c) -recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) -recent_scanned_file - VM internal parameter. (see mm/vmscan.c) - -Memo: - recent_rotated means the recent frequency of LRU rotation. - recent_scanned means the recent # of scans of the LRU. - These are shown for better debugging; please see the code for their precise meanings. - -Note: - Only anonymous and swap cache memory is listed as part of the 'rss' stat. - This should not be confused with the true 'resident set size' or the - amount of physical memory used by the cgroup. - 'rss + file_mapped' will give you the resident set size of the cgroup. - (Note: file and shmem may be shared among other cgroups. In that case, - file_mapped is accounted only when the memory cgroup is the owner of the page - cache.) - -5.3 swappiness - -Overrides /proc/sys/vm/swappiness for the particular group. The tunable -in the root cgroup corresponds to the global swappiness setting. - -Please note that, unlike global reclaim, limit reclaim -enforces that a swappiness of 0 really prevents any swapping even if -swap storage is available. This might lead to the memcg OOM killer -being invoked if there are no file pages to reclaim. - -5.4 failcnt - -A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. -This failcnt (== failure count) shows the number of times that a usage counter -hit its limit. When a memory cgroup hits a limit, failcnt increases and -memory under it will be reclaimed. - -You can reset failcnt by writing 0 to the failcnt file. -# echo 0 > .../memory.failcnt - -5.5 usage_in_bytes - -For efficiency, like other kernel components, the memory cgroup uses some optimizations -to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by this -method and doesn't show the 'exact' value of memory (and swap) usage; it's a fuzzy -value for efficient access. (Of course, when necessary, it's synchronized.) -If you want to know the more exact memory usage, you should use the RSS+CACHE(+SWAP) -value in memory.stat (see 5.2). - -5.6 numa_stat - -This is similar to numa_maps but operates on a per-memcg basis. This is -useful for providing visibility into the numa locality information within -a memcg since the pages are allowed to be allocated from any physical -node. One of the use cases is evaluating application performance by -combining this information with the application's CPU allocation.
- -Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" -per-node page counts, including "hierarchical_<counter>" entries which sum up all -hierarchical children's values in addition to the memcg's own value. - -The output format of memory.numa_stat is: - -total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... -file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... -anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ... -unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ... -hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ... - -The "total" count is the sum of file + anon + unevictable. - -6. Hierarchy support - -The memory controller supports a deep hierarchy and hierarchical accounting. -The hierarchy is created by creating the appropriate cgroups in the -cgroup filesystem. Consider, for example, the following cgroup filesystem -hierarchy - - root - / | \ - / | \ - a b c - | \ - | \ - d e - -In the diagram above, with hierarchical accounting enabled, all memory -usage of e is accounted to its ancestors up until the root (i.e., c and root) -that have memory.use_hierarchy enabled. If one of the ancestors goes over its -limit, the reclaim algorithm reclaims from the tasks in the ancestor and the -children of the ancestor. - -6.1 Enabling hierarchical accounting and reclaim - -A memory cgroup by default disables the hierarchy feature. Support -can be enabled by writing 1 to the memory.use_hierarchy file of the root cgroup - -# echo 1 > memory.use_hierarchy - -The feature can be disabled by - -# echo 0 > memory.use_hierarchy - -NOTE1: Enabling/disabling will fail if either the cgroup already has other - cgroups created below it, or if the parent cgroup has use_hierarchy - enabled. - -NOTE2: When panic_on_oom is set to "2", the whole system will panic in - case of an OOM event in any cgroup. - -7. Soft limits - -Soft limits allow for greater sharing of memory. The idea behind soft limits -is to allow control groups to use as much of the memory as needed, provided - -a. There is no memory contention -b. They do not exceed their hard limit - -When the system detects memory contention or low memory, control groups -are pushed back to their soft limits. If the soft limit of each control -group is very high, they are pushed back as much as possible to make -sure that one control group does not starve the others of memory. - -Please note that soft limits are a best-effort feature; they come with -no guarantees, but the kernel does its best to make sure that when memory is -heavily contended for, memory is allocated based on the soft limit -hints/setup. Currently soft limit based reclaim is set up such that -it gets invoked from balance_pgdat (kswapd). - -7.1 Interface - -Soft limits can be set up by using the following commands (in this example we -assume a soft limit of 256 MiB) - -# echo 256M > memory.soft_limit_in_bytes - -If we want to change this to 1G, we can at any time use - -# echo 1G > memory.soft_limit_in_bytes - -NOTE1: Soft limits take effect over a long period of time, since they involve - reclaiming memory for balancing between memory cgroups -NOTE2: It is recommended to always set the soft limit below the hard limit, - otherwise the hard limit will take precedence. - -8. Move charges at task migration - -Users can move charges associated with a task along with task migration, that -is, uncharge the task's pages from the old cgroup and charge them to the new cgroup. -This feature is not supported in !CONFIG_MMU environments because of lack of -page tables. - -8.1 Interface - -This feature is disabled by default. It can be enabled (and disabled again) by -writing to memory.move_charge_at_immigrate of the destination cgroup.
- -If you want to enable it: - -# echo (some positive value) > memory.move_charge_at_immigrate - -Note: Each bit of move_charge_at_immigrate has its own meaning about what type - of charges should be moved. See 8.2 for details. -Note: Charges are moved only when you move mm->owner, in other words, - the leader of a thread group. -Note: If we cannot find enough space for the task in the destination cgroup, we - try to make space by reclaiming memory. Task migration may fail if we - cannot make enough space. -Note: It can take several seconds if you move many charges. - -And if you want to disable it again: - -# echo 0 > memory.move_charge_at_immigrate - -8.2 Types of charges which can be moved - -Each bit in move_charge_at_immigrate has its own meaning about what type of -charges should be moved. But in any case, it must be noted that an account of -a page or a swap can be moved only when it is charged to the task's current -(old) memory cgroup. - - bit | what type of charges would be moved ? - -----+------------------------------------------------------------------------ - 0 | A charge of an anonymous page (or swap of it) used by the target task. - | You must enable Swap Extension (see 2.4) to enable move of swap charges. - -----+------------------------------------------------------------------------ - 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) - | and swaps of tmpfs file) mmapped by the target task. Unlike the case of - | anonymous pages, file pages (and swaps) in the range mmapped by the task - | will be moved even if the task hasn't done a page fault, i.e. they might - | not be the task's "RSS", but another task's "RSS" that maps the same file. - | And the mapcount of the page is ignored (the page can be moved even if - | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to - | enable move of swap charges. - -8.3 TODO - -- All moving-charge operations are done under cgroup_mutex. It's not good - behavior to hold the mutex too long, so we may need some trick. - -9. Memory thresholds - -Memory cgroup implements memory thresholds using the cgroups notification -API (see cgroups.txt). It allows registering multiple memory and memsw -thresholds and getting notifications when a threshold is crossed. - -To register a threshold, an application must: -- create an eventfd using eventfd(2); -- open memory.usage_in_bytes or memory.memsw.usage_in_bytes; -- write a string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to - cgroup.event_control. - -The application will be notified through eventfd when memory usage crosses -the threshold in either direction. - -It's applicable to both root and non-root cgroups. - -10. OOM Control - -The memory.oom_control file is for OOM notification and other controls. - -Memory cgroup implements an OOM notifier using the cgroup notification -API (see cgroups.txt). It allows registering multiple OOM notification -deliveries and getting a notification when an OOM happens. - -To register a notifier, an application must: - - create an eventfd using eventfd(2) - - open the memory.oom_control file - - write a string like "<event_fd> <fd of memory.oom_control>" to - cgroup.event_control - -The application will be notified through eventfd when OOM happens. -OOM notification doesn't work for the root cgroup. - -You can disable the OOM-killer by writing "1" to the memory.oom_control file, as: - - #echo 1 > memory.oom_control - -If the OOM-killer is disabled, tasks under the cgroup will hang/sleep -in the memory cgroup's OOM-waitqueue when they request accountable memory. - -For running them, you have to relax the memory cgroup's OOM status by - * enlarging the limit or reducing usage.
-To reduce usage, - * kill some tasks. - * move some tasks to another group with account migration. - * remove some files (on tmpfs?) - -Then, the stopped tasks will work again. - -On reading, the current OOM status is shown. - oom_kill_disable 0 or 1 (if 1, the oom-killer is disabled) - under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may - be stopped.) - -11. Memory Pressure - -The pressure level notifications can be used to monitor the memory -allocation cost; based on the pressure, applications can implement -different strategies of managing their memory resources. The pressure -levels are defined as follows: - -The "low" level means that the system is reclaiming memory for new -allocations. Monitoring this reclaiming activity might be useful for -maintaining the cache level. Upon notification, the program (typically -"Activity Manager") might analyze vmstat and act in advance (i.e. -prematurely shut down unimportant services). - -The "medium" level means that the system is experiencing medium memory -pressure; the system might be swapping, paging out active file caches, -etc. Upon this event applications may decide to further analyze -vmstat/zoneinfo/memcg or internal memory usage statistics and free any -resources that can be easily reconstructed or re-read from a disk. - -The "critical" level means that the system is actively thrashing, it is -about to run out of memory (OOM) or even the in-kernel OOM killer is on its -way to trigger. Applications should do whatever they can to help the -system. It might be too late to consult with vmstat or any other -statistics, so it's advisable to take immediate action. - -The events are propagated upward until the event is handled, i.e. the -events are not pass-through. Here is what this means: for example you have -three cgroups: A->B->C. Now you set up an event listener on cgroups A, B -and C, and suppose group C experiences some pressure. In this situation, -only group C will receive the notification, i.e. groups A and B will not -receive it. This is done to avoid excessive "broadcasting" of messages, -which disturbs the system and which is especially bad if we are low on -memory or thrashing. So, organize the cgroups wisely, or propagate the -events manually (or, ask us to implement the pass-through events, -explaining why you would need them.) - -The file memory.pressure_level is only used to set up an eventfd. To -register a notification, an application must: - -- create an eventfd using eventfd(2); -- open memory.pressure_level; -- write a string like "<event_fd> <fd of memory.pressure_level> <level>" - to cgroup.event_control. - -The application will be notified through eventfd when memory pressure is at -the specific level (or higher). Read/write operations on -memory.pressure_level are not implemented. - -Test: - - Here is a small script example that makes a new cgroup, sets up a - memory limit, sets up a notification in the cgroup and then makes the child - cgroup experience critical pressure: - - # cd /sys/fs/cgroup/memory/ - # mkdir foo - # cd foo - # cgroup_event_listener memory.pressure_level low & - # echo 8000000 > memory.limit_in_bytes - # echo 8000000 > memory.memsw.limit_in_bytes - # echo $$ > tasks - # dd if=/dev/zero | read x - - (Expect a bunch of notifications, and eventually, the oom-killer will - trigger.) - -12. TODO - -1. Make the per-cgroup scanner reclaim not-shared pages first -2. Teach the controller to account for shared pages -3.
Start reclamation in the background when the limit is - not yet hit but the usage is getting closer - -Summary - -Overall, the memory controller has been a stable controller and has been -commented and discussed quite extensively in the community. - -References - -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), - http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups - http://lkml.org/lkml/2007/3/6/198 -4. Emelianov, Pavel. RSS controller based on process cgroups (v2) - http://lkml.org/lkml/2007/4/9/78 -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) - http://lkml.org/lkml/2007/5/30/244 -6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ -7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control - subsystem (v3), http://lwn.net/Articles/235534/ -8. Singh, Balbir. RSS controller v2 test results (lmbench), - http://lkml.org/lkml/2007/5/17/232 -9. Singh, Balbir. RSS controller v2 AIM9 results - http://lkml.org/lkml/2007/5/18/1 -10. Singh, Balbir. Memory controller v6 test results, - http://lkml.org/lkml/2007/8/19/36 -11. Singh, Balbir. Memory controller introduction (v6), - http://lkml.org/lkml/2007/8/17/69 -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroups/net_cls.txt b/Documentation/cgroups/net_cls.txt deleted file mode 100644 index ec18234..0000000 --- a/Documentation/cgroups/net_cls.txt +++ /dev/null @@ -1,39 +0,0 @@ -Network classifier cgroup -------------------------- - -The Network classifier cgroup provides an interface to -tag network packets with a class identifier (classid). - -The Traffic Controller (tc) can be used to assign -different priorities to packets from different cgroups. -Also, Netfilter (iptables) can use this tag to perform -actions on such packets. - -Creating a net_cls cgroups instance creates a net_cls.classid file. -This net_cls.classid value is initialized to 0. - -You can write hexadecimal values to net_cls.classid; the format for these -values is 0xAAAABBBB; AAAA is the major handle number and BBBB -is the minor handle number. -Reading net_cls.classid yields a decimal result. - -Example: -mkdir /sys/fs/cgroup/net_cls -mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls -mkdir /sys/fs/cgroup/net_cls/0 -echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid - - setting a 10:1 handle. - -cat /sys/fs/cgroup/net_cls/0/net_cls.classid -1048577 - -configuring tc: -tc qdisc add dev eth0 root handle 10: htb - -tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit - - creating traffic class 10:1 - -tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup - -configuring iptables, basic example: -iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP diff --git a/Documentation/cgroups/net_prio.txt b/Documentation/cgroups/net_prio.txt deleted file mode 100644 index a82cbd2..0000000 --- a/Documentation/cgroups/net_prio.txt +++ /dev/null @@ -1,55 +0,0 @@ -Network priority cgroup -------------------------- - -The Network priority cgroup provides an interface to allow an administrator to -dynamically set the priority of network traffic generated by various -applications - -Nominally, an application would set the priority of its traffic via the -SO_PRIORITY socket option. 
This, however, is not always possible because: - -1) The application may not have been coded to set this value -2) The priority of application traffic is often a site-specific administrative - decision rather than an application-defined one. - -This cgroup allows an administrator to assign a process to a group which defines -the priority of egress traffic on a given interface. Network priority groups can -be created by first mounting the cgroup filesystem. - -# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio - -With the above step, the initial group acting as the parent accounting group -becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in -the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup. - -Each net_prio cgroup contains two files that are subsystem-specific - -net_prio.prioidx -This file is read-only, and is simply informative. It contains a unique integer -value that the kernel uses as an internal representation of this cgroup. - -net_prio.ifpriomap -This file contains a map of the priorities assigned to traffic originating from -processes in this group and egressing the system on various interfaces. It -contains a list of tuples in the form <ifname priority>. The contents of this file -can be modified by echoing a string into the file using the same tuple format, -for example: - -echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap - -This command would force any traffic originating from processes belonging to the -iscsi net_prio cgroup and egressing on interface eth0 to have the priority of -said traffic set to the value 5. The parent accounting group also has a -writeable 'net_prio.ifpriomap' file that can be used to set a system default -priority. - -Priorities are set immediately prior to queueing a frame to the device -queueing discipline (qdisc), so priorities will be assigned prior to the hardware -queue selection being made. - -One usage for the net_prio cgroup is with the mqprio qdisc, allowing application -traffic to be steered to hardware/driver based traffic classes. These mappings -can then be managed by administrators or other networking protocols such as -DCBX. - -A new net_prio cgroup inherits the parent's configuration. diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroups/pids.txt deleted file mode 100644 index 1a078b5..0000000 --- a/Documentation/cgroups/pids.txt +++ /dev/null @@ -1,85 +0,0 @@ - Process Number Controller - ========================= - -Abstract -------- - -The process number controller is used to allow a cgroup hierarchy to stop any -new tasks from being fork()'d or clone()'d after a certain limit is reached. - -Since it is trivial to hit the task limit without hitting any kmemcg limits in -place, PIDs are a fundamental resource. As such, PID exhaustion must be -preventable in the scope of a cgroup hierarchy by allowing resource limiting of -the number of tasks in a cgroup. - -Usage ----- - -In order to use the `pids` controller, set the maximum number of tasks in -pids.max (this is not available in the root cgroup for obvious reasons). The -number of processes currently in the cgroup is given by pids.current. - -Organisational operations are not blocked by cgroup policies, so it is possible -to have pids.current > pids.max. This can be done by either setting the limit to -be smaller than pids.current, or attaching enough processes to the cgroup such -that pids.current > pids.max. However, it is not possible to violate a cgroup -policy through fork() or clone().
fork() and clone() will return -EAGAIN if the -creation of a new process would cause a cgroup policy to be violated. - -To set a cgroup to have no limit, set pids.max to "max". This is the default for -all new cgroups (N.B. that PID limits are hierarchical, so the most stringent -limit in the hierarchy is followed). - -pids.current tracks all child cgroup hierarchies, so parent/pids.current is a -superset of parent/child/pids.current. - -Example ------- - -First, we mount the pids controller: -# mkdir -p /sys/fs/cgroup/pids -# mount -t cgroup -o pids none /sys/fs/cgroup/pids - -Then we create a hierarchy, set limits and attach processes to it: -# mkdir -p /sys/fs/cgroup/pids/parent/child -# echo 2 > /sys/fs/cgroup/pids/parent/pids.max -# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs -# cat /sys/fs/cgroup/pids/parent/pids.current -2 -# - -It should be noted that attempts to overcome the set limit (2 in this case) will -fail: - -# cat /sys/fs/cgroup/pids/parent/pids.current -2 -# ( /bin/echo "Here's some processes for you." | cat ) -sh: fork: Resource temporarily unavailable -# - -Even if we migrate to a child cgroup (which doesn't have a set limit), we will -not be able to overcome the most stringent limit in the hierarchy (in this case, -parent's): - -# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs -# cat /sys/fs/cgroup/pids/parent/pids.current -2 -# cat /sys/fs/cgroup/pids/parent/child/pids.current -2 -# cat /sys/fs/cgroup/pids/parent/child/pids.max -max -# ( /bin/echo "Here's some processes for you." | cat ) -sh: fork: Resource temporarily unavailable -# - -We can set a limit that is smaller than pids.current, which will stop any new -processes from being forked at all (note that the shell itself counts towards -pids.current): - -# echo 1 > /sys/fs/cgroup/pids/parent/pids.max -# /bin/echo "We can't even spawn a single process now." -sh: fork: Resource temporarily unavailable -# echo 0 > /sys/fs/cgroup/pids/parent/pids.max -# /bin/echo "We can't even spawn a single process now." -sh: fork: Resource temporarily unavailable -# diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt deleted file mode 100644 index c1f0e87..0000000 --- a/Documentation/cgroups/unified-hierarchy.txt +++ /dev/null @@ -1,645 +0,0 @@ - -Cgroup unified hierarchy - -April, 2014 Tejun Heo - -This document describes the changes made by unified hierarchy and -their rationales. It will eventually be merged into the main cgroup -documentation. - -CONTENTS - -1. Background -2. Basic Operation - 2-1. Mounting - 2-2. cgroup.subtree_control - 2-3. cgroup.controllers -3. Structural Constraints - 3-1. Top-down - 3-2. No internal tasks -4. Delegation - 4-1. Model of delegation - 4-2. Common ancestor rule -5. Other Changes - 5-1. [Un]populated Notification - 5-2. Other Core Changes - 5-3. Controller File Conventions - 5-3-1. Format - 5-3-2. Control Knobs - 5-4. Per-Controller Changes - 5-4-1. io - 5-4-2. cpuset - 5-4-3. memory -6. Planned Changes - 6-1. CAP for resource control - - -1. Background - -cgroup allows an arbitrary number of hierarchies and each hierarchy -can host any number of controllers. While this seems to provide a -high level of flexibility, it isn't quite useful in practice. - -For example, as there is only one instance of each controller, utility-type -controllers such as freezer which can be useful in all -hierarchies can only be used in one. The issue is exacerbated by the -fact that controllers can't be moved around once hierarchies are -populated.
Another issue is that all controllers bound to a hierarchy -are forced to have exactly the same view of the hierarchy. It isn't -possible to vary the granularity depending on the specific controller. - -In practice, these issues heavily limit which controllers can be put -on the same hierarchy and most configurations resort to putting each -controller on its own hierarchy. Only closely related ones, such as -the cpu and cpuacct controllers, make sense to put on the same -hierarchy. This often means that userland ends up managing multiple -similar hierarchies repeating the same steps on each hierarchy -whenever a hierarchy management operation is necessary. - -Unfortunately, support for multiple hierarchies comes at a steep cost. -Internal implementation in cgroup core proper is dazzlingly -complicated but more importantly the support for multiple hierarchies -restricts how cgroup is used in general and what controllers can do. - -There's no limit on how many hierarchies there may be, which means -that a task's cgroup membership can't be described in finite length. -The key may contain any varying number of entries and is unlimited in -length, which makes it highly awkward to handle and leads to addition -of controllers which exist only to identify membership, which in turn -exacerbates the original problem. - -Also, as a controller can't have any expectation regarding what shape -of hierarchies other controllers would be on, each controller has to -assume that all other controllers are operating on completely -orthogonal hierarchies. This makes it impossible, or at least very -cumbersome, for controllers to cooperate with each other. - -In most use cases, putting controllers on hierarchies which are -completely orthogonal to each other isn't necessary. What usually is -called for is the ability to have differing levels of granularity -depending on the specific controller. In other words, hierarchy may -be collapsed from leaf towards root when viewed from specific -controllers. For example, a given configuration might not care about -how memory is distributed beyond a certain level while still wanting -to control how CPU cycles are distributed. - -Unified hierarchy is the next version of cgroup interface. It aims to -address the aforementioned issues by having more structure while -retaining enough flexibility for most use cases. Various other -general and controller-specific interface issues are also addressed in -the process. - - -2. Basic Operation - -2-1. Mounting - -Unified hierarchy can be mounted with the following mount command. - - mount -t cgroup2 none $MOUNT_POINT - -All controllers which support the unified hierarchy and are not bound -to other hierarchies are automatically bound to unified hierarchy and -show up at the root of it. Controllers which are enabled only in the -root of unified hierarchy can be bound to other hierarchies. This -allows mixing unified hierarchy with the traditional multiple -hierarchies in a fully backward compatible way. - -A controller can be moved across hierarchies only after the controller -is no longer referenced in its current hierarchy. Because per-cgroup -controller states are destroyed asynchronously and controllers may -have lingering references, a controller may not show up immediately on -the unified hierarchy after the final umount of the previous -hierarchy. 
Similarly, a controller should be fully disabled to be -moved out of the unified hierarchy and it may take some time for the -disabled controller to become available for other hierarchies; -furthermore, due to dependencies among controllers, other controllers -may need to be disabled too. - -While useful for development and manual configurations, dynamically -moving controllers between the unified and other hierarchies is -strongly discouraged for production use. It is recommended to decide -the hierarchies and controller associations before starting using the -controllers. - - -2-2. cgroup.subtree_control - -All cgroups on unified hierarchy have a "cgroup.subtree_control" file -which governs which controllers are enabled on the children of the -cgroup. Let's assume a hierarchy like the following. - - root - A - B - C - \ D - -root's "cgroup.subtree_control" file determines which controllers are -enabled on A. A's on B. B's on C and D. This coincides with the -fact that controllers on the immediate sub-level are used to -distribute the resources of the parent. In fact, it's natural to -assume that resource control knobs of a child belong to its parent. -Enabling a controller in a "cgroup.subtree_control" file declares that -distribution of the respective resources of the cgroup will be -controlled. Note that this means that controller enable states are -shared among siblings. - -When read, the file contains a space-separated list of currently -enabled controllers. A write to the file should contain a -space-separated list of controllers with '+' or '-' prefixed (without -the quotes). Controllers prefixed with '+' are enabled and '-' -disabled. If a controller is listed multiple times, the last entry -wins. The specific operations are executed atomically - either all -succeed or fail. - - -2-3. cgroup.controllers - -Read-only "cgroup.controllers" file contains a space-separated list of -controllers which can be enabled in the cgroup's -"cgroup.subtree_control" file. - -In the root cgroup, this lists controllers which are not bound to -other hierarchies and the content changes as controllers are bound to -and unbound from other hierarchies. - -In non-root cgroups, the content of this file equals that of the -parent's "cgroup.subtree_control" file as only controllers enabled -from the parent can be used in its children. - - -3. Structural Constraints - -3-1. Top-down - -As it doesn't make sense to nest control of an uncontrolled resource, -all non-root "cgroup.subtree_control" files can only contain -controllers which are enabled in the parent's "cgroup.subtree_control" -file. A controller can be enabled only if the parent has the -controller enabled and a controller can't be disabled if one or more -children have it enabled. - - -3-2. No internal tasks - -One long-standing issue that cgroup faces is the competition between -tasks belonging to the parent cgroup and its children cgroups. This -is inherently nasty as two different types of entities compete and -there is no agreed-upon obvious way to handle it. Different -controllers are doing different things. - -The cpu controller considers tasks and cgroups as equivalents and maps -nice levels to cgroup weights. This works for some cases but falls -flat when children should be allocated specific ratios of CPU cycles -and the number of internal tasks fluctuates - the ratios constantly -change as the number of competing entities fluctuates. There also are -other issues. 
The mapping from nice level to weight isn't obvious or -universal, and there are various other knobs which simply aren't -available for tasks. - -The io controller implicitly creates a hidden leaf node for each -cgroup to host the tasks. The hidden leaf has its own copies of all -the knobs with "leaf_" prefixed. While this allows equivalent control -over internal tasks, it's with serious drawbacks. It always adds an -extra layer of nesting which may not be necessary, makes the interface -messy and significantly complicates the implementation. - -The memory controller currently doesn't have a way to control what -happens between internal tasks and child cgroups and the behavior is -not clearly defined. There have been attempts to add ad-hoc behaviors -and knobs to tailor the behavior to specific workloads. Continuing -this direction will lead to problems which will be extremely difficult -to resolve in the long term. - -Multiple controllers struggle with internal tasks and came up with -different ways to deal with it; unfortunately, all the approaches in -use now are severely flawed and, furthermore, the widely different -behaviors make cgroup as whole highly inconsistent. - -It is clear that this is something which needs to be addressed from -cgroup core proper in a uniform way so that controllers don't need to -worry about it and cgroup as a whole shows a consistent and logical -behavior. To achieve that, unified hierarchy enforces the following -structural constraint: - - Except for the root, only cgroups which don't contain any task may - have controllers enabled in their "cgroup.subtree_control" files. - -Combined with other properties, this guarantees that, when a -controller is looking at the part of the hierarchy which has it -enabled, tasks are always only on the leaves. This rules out -situations where child cgroups compete against internal tasks of the -parent. - -There are two things to note. Firstly, the root cgroup is exempt from -the restriction. Root contains tasks and anonymous resource -consumption which can't be associated with any other cgroup and -requires special treatment from most controllers. How resource -consumption in the root cgroup is governed is up to each controller. - -Secondly, the restriction doesn't take effect if there is no enabled -controller in the cgroup's "cgroup.subtree_control" file. This is -important as otherwise it wouldn't be possible to create children of a -populated cgroup. To control resource distribution of a cgroup, the -cgroup must create children and transfer all its tasks to the children -before enabling controllers in its "cgroup.subtree_control" file. - - -4. Delegation - -4-1. Model of delegation - -A cgroup can be delegated to a less privileged user by granting write -access of the directory and its "cgroup.procs" file to the user. Note -that the resource control knobs in a given directory concern the -resources of the parent and thus must not be delegated along with the -directory. - -Once delegated, the user can build sub-hierarchy under the directory, -organize processes as it sees fit and further distribute the resources -it got from the parent. The limits and other settings of all resource -controllers are hierarchical and regardless of what happens in the -delegated sub-hierarchy, nothing can escape the resource restrictions -imposed by the parent. 
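In filesystem terms, the delegation described above is just ownership of the right nodes. A minimal sketch, assuming a unified-hierarchy (cgroup2) mount at /sys/fs/cgroup and a hypothetical unprivileged user "u0":

  # mkdir /sys/fs/cgroup/delegated
  # chown u0 /sys/fs/cgroup/delegated                 # u0 can now create sub-cgroups here
  # chown u0 /sys/fs/cgroup/delegated/cgroup.procs    # u0 can now move its own processes in

The resource control knobs inside the directory are deliberately left owned by root: as noted above, they configure the distribution of the parent's resources and must not be handed over along with the directory.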
-
-Currently, cgroup doesn't impose any restrictions on the number of
-cgroups in or nesting depth of a delegated sub-hierarchy; however,
-this may in the future be limited explicitly.
-
-
-4-2. Common ancestor rule
-
-On the unified hierarchy, to write to a "cgroup.procs" file, in
-addition to the usual write permission to the file and uid match, the
-writer must also have write access to the "cgroup.procs" file of the
-common ancestor of the source and destination cgroups. This prevents
-delegatees from smuggling processes across disjoint sub-hierarchies.
-
-Let's say cgroups C0 and C1 have been delegated to user U0 who created
-C00, C01 under C0 and C10 under C1 as follows.
-
-  ~~~~~~~~~~~~~ - C0 - C00
-  ~ cgroup    ~      \ C01
-  ~ hierarchy ~
-  ~~~~~~~~~~~~~ - C1 - C10
-
-C0 and C1 are separate entities in terms of resource distribution
-regardless of their relative positions in the hierarchy. The
-resources the processes under C0 are entitled to are controlled by
-C0's ancestors and may be completely different from C1. It's clear
-that the intention of delegating C0 to U0 is allowing U0 to organize
-the processes under C0 and further control the distribution of C0's
-resources.
-
-On traditional hierarchies, if a task has write access to "tasks" or
-"cgroup.procs" file of a cgroup and its uid agrees with the target, it
-can move the target to the cgroup. In the above example, U0 will not
-only be able to move processes in each sub-hierarchy but also across
-the two sub-hierarchies, effectively allowing it to violate the
-organizational and resource restrictions implied by the hierarchical
-structure above C0 and C1.
-
-On the unified hierarchy, let's say U0 wants to write the pid of a
-process which has a matching uid and is currently in C10 into
-"C00/cgroup.procs". U0 obviously has write access to the file and
-migration permission on the process; however, the common ancestor of
-the source cgroup C10 and the destination cgroup C00 is above the
-points of delegation and U0 would not have write access to its
-"cgroup.procs" and thus be denied with -EACCES.
-
-
-5. Other Changes
-
-5-1. [Un]populated Notification
-
-cgroup users often need a way to determine when a cgroup's
-subhierarchy becomes empty so that it can be cleaned up. cgroup
-currently provides release_agent for it; unfortunately, this mechanism
-is riddled with issues.
-
-- It delivers events by forking and execing a userland binary
-  specified as the release_agent. This is a long deprecated method of
-  notification delivery. It's extremely heavy, slow and cumbersome to
-  integrate with larger infrastructure.
-
-- There is a single monitoring point at the root. There's no way to
-  delegate management of a subtree.
-
-- The event isn't recursive. It triggers when a cgroup doesn't have
-  any tasks or child cgroups. Events for internal nodes trigger only
-  after all children are removed. This again makes it impossible to
-  delegate management of a subtree.
-
-- Events are filtered from the kernel side. A "notify_on_release"
-  file is used to subscribe to or suppress release events. This is
-  unnecessarily complicated and probably done this way because event
-  delivery itself was expensive.
-
-Unified hierarchy implements "populated" field in "cgroup.events"
-interface file which can be used to monitor whether the cgroup's
-subhierarchy has tasks in it or not. Its value is 0 if there is no
-task in the cgroup and its descendants; otherwise, 1. poll and
-[id]notify events are triggered when the value changes.
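For illustration, a rough sketch of a userland watcher for this
notification. The helper name is made up; it assumes, as is the case
for kernfs-backed interface files, that a change is reported to
poll(2) as an exceptional condition (POLLPRI):

  #include <fcntl.h>
  #include <poll.h>
  #include <stdio.h>
  #include <unistd.h>

  static int watch_events(const char *path)
  {
	struct pollfd pfd = { .events = POLLPRI };
	char buf[64];
	ssize_t len;

	pfd.fd = open(path, O_RDONLY);
	if (pfd.fd < 0)
		return -1;

	if (poll(&pfd, 1, -1) > 0) {
		/* the file must be re-read to see the new value */
		len = pread(pfd.fd, buf, sizeof(buf) - 1, 0);
		if (len > 0) {
			buf[len] = '\0';
			fputs(buf, stdout);	/* e.g. "populated 1" */
		}
	}
	close(pfd.fd);
	return 0;
  }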
-
-This is significantly lighter and simpler and trivially allows
-delegating management of subhierarchy - subhierarchy monitoring can
-block further propagation simply by putting itself or another process
-in the subhierarchy and monitor events that it's interested in from
-there without interfering with monitoring higher in the tree.
-
-In unified hierarchy, the release_agent mechanism is no longer
-supported and the interface files "release_agent" and
-"notify_on_release" do not exist.
-
-
-5-2. Other Core Changes
-
-- None of the mount options is allowed.
-
-- remount is disallowed.
-
-- rename(2) is disallowed.
-
-- The "tasks" file is removed. Everything should be at process
-  granularity. Use the "cgroup.procs" file instead.
-
-- The "cgroup.procs" file is not sorted. pids will be unique unless
-  they got recycled in-between reads.
-
-- The "cgroup.clone_children" file is removed.
-
-- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
-  to before exiting. If the cgroup is removed before the zombie is
-  reaped, " (deleted)" is appended to the path.
-
-
-5-3. Controller File Conventions
-
-5-3-1. Format
-
-In general, all controller files should be in one of the following
-formats whenever possible.
-
-- Values only files
-
-  VAL0 VAL1...\n
-
-- Flat keyed files
-
-  KEY0 VAL0\n
-  KEY1 VAL1\n
-  ...
-
-- Nested keyed files
-
-  KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
-  KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
-  ...
-
-For a writeable file, the format for writing should generally match
-reading; however, controllers may allow omitting later fields or
-implement restricted shortcuts for most common use cases.
-
-For both flat and nested keyed files, only the values for a single key
-can be written at a time. For nested keyed files, the sub key pairs
-may be specified in any order and not all pairs have to be specified.
-
-
-5-3-2. Control Knobs
-
-- Settings for a single feature should generally be implemented in a
-  single file.
-
-- In general, the root cgroup should be exempt from resource control
-  and thus shouldn't have resource control knobs.
-
-- If a controller implements ratio based resource distribution, the
-  control knob should be named "weight" and have the range [1, 10000]
-  and 100 should be the default value. The values are chosen to allow
-  enough and symmetric bias in both directions while keeping it
-  intuitive (the default is 100%).
-
-- If a controller implements an absolute resource guarantee and/or
-  limit, the control knobs should be named "min" and "max"
-  respectively. If a controller implements best effort resource
-  guarantee and/or limit, the control knobs should be named "low" and
-  "high" respectively.
-
-  In the above four control files, the special token "max" should be
-  used to represent upward infinity for both reading and writing.
-
-- If a setting has configurable default value and specific overrides,
-  the default settings should be keyed with "default" and appear as
-  the first entry in the file. Specific entries can use "default" as
-  its value to indicate inheritance of the default value.
-
-- For events which are not very high frequency, an interface file
-  "events" should be created which lists event key value pairs.
-  Whenever a notifiable event happens, file modified event should be
-  generated on the file.
-
-
-5-4. Per-Controller Changes
-
-5-4-1. io
-
-- blkio is renamed to io. The interface is overhauled anyway. The
-  new name is more in line with the other two major controllers, cpu
-  and memory, and better suited given that it may be used for cgroup
-  writeback without involving block layer.
-
-- Everything including stat is always hierarchical making separate
-  recursive stat files pointless and, as no internal node can have
-  tasks, leaf weights are meaningless. The operation model is
-  simplified and the interface is overhauled accordingly.
-
-  io.stat
-
-	The stat file. The reported stats are from the point where
-	bio's are issued to request_queue. The stats are counted
-	independent of which policies are enabled. Each line in the
-	file follows the following format. More fields may later be
-	added at the end.
-
-	  $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS
-
-  io.weight
-
-	The weight setting, currently only available and effective if
-	cfq-iosched is in use for the target device. The weight is
-	between 1 and 10000 and defaults to 100. The first line
-	always contains the default weight in the following format to
-	use when per-device setting is missing.
-
-	  default $WEIGHT
-
-	Subsequent lines list per-device weights of the following
-	format.
-
-	  $MAJ:$MIN $WEIGHT
-
-	Writing "$WEIGHT" or "default $WEIGHT" changes the default
-	setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
-	while "$MAJ:$MIN default" clears it.
-
-	This file is available only on non-root cgroups.
-
-  io.max
-
-	The maximum bandwidth and/or iops setting, only available if
-	blk-throttle is enabled. The file is of the following format.
-
-	  $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
-
-	${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
-	read/write IOs per second. "max" indicates no limit. Writing
-	to the file follows the same format but the individual
-	settings may be omitted or specified in any order.
-
-	This file is available only on non-root cgroups.
-
-
-5-4-2. cpuset
-
-- Tasks are kept in empty cpusets after hotplug and take on the masks
-  of the nearest non-empty ancestor, instead of being moved to it.
-
-- A task can be moved into an empty cpuset, and again it takes on the
-  masks of the nearest non-empty ancestor.
-
-
-5-4-3. memory
-
-- use_hierarchy is on by default and the cgroup file for the flag is
-  not created.
-
-- The original lower boundary, the soft limit, is defined as a limit
-  that is per default unset. As a result, the set of cgroups that
-  global reclaim prefers is opt-in, rather than opt-out. The costs
-  for optimizing these mostly negative lookups are so high that the
-  implementation, despite its enormous size, does not even provide the
-  basic desirable behavior. First off, the soft limit has no
-  hierarchical meaning. All configured groups are organized in a
-  global rbtree and treated like equal peers, regardless where they
-  are located in the hierarchy. This makes subtree delegation
-  impossible. Second, the soft limit reclaim pass is so aggressive
-  that it not just introduces high allocation latencies into the
-  system, but also impacts system performance due to overreclaim, to
-  the point where the feature becomes self-defeating.
-
-  The memory.low boundary on the other hand is a top-down allocated
-  reserve. A cgroup enjoys reclaim protection when it and all its
-  ancestors are below their low boundaries, which makes delegation of
-  subtrees possible. Secondly, new cgroups have no reserve per
-  default and in the common case most cgroups are eligible for the
-  preferred reclaim pass.
This allows the new low boundary to be - efficiently implemented with just a minor addition to the generic - reclaim code, without the need for out-of-band data structures and - reclaim passes. Because the generic reclaim code considers all - cgroups except for the ones running low in the preferred first - reclaim pass, overreclaim of individual groups is eliminated as - well, resulting in much better overall workload performance. - -- The original high boundary, the hard limit, is defined as a strict - limit that can not budge, even if the OOM killer has to be called. - But this generally goes against the goal of making the most out of - the available memory. The memory consumption of workloads varies - during runtime, and that requires users to overcommit. But doing - that with a strict upper limit requires either a fairly accurate - prediction of the working set size or adding slack to the limit. - Since working set size estimation is hard and error prone, and - getting it wrong results in OOM kills, most users tend to err on the - side of a looser limit and end up wasting precious resources. - - The memory.high boundary on the other hand can be set much more - conservatively. When hit, it throttles allocations by forcing them - into direct reclaim to work off the excess, but it never invokes the - OOM killer. As a result, a high boundary that is chosen too - aggressively will not terminate the processes, but instead it will - lead to gradual performance degradation. The user can monitor this - and make corrections until the minimal memory footprint that still - gives acceptable performance is found. - - In extreme cases, with many concurrent allocations and a complete - breakdown of reclaim progress within the group, the high boundary - can be exceeded. But even then it's mostly better to satisfy the - allocation from the slack available in other groups or the rest of - the system than killing the group. Otherwise, memory.max is there - to limit this type of spillover and ultimately contain buggy or even - malicious applications. - -- The original control file names are unwieldy and inconsistent in - many different ways. For example, the upper boundary hit count is - exported in the memory.failcnt file, but an OOM event count has to - be manually counted by listening to memory.oom_control events, and - lower boundary / soft limit events have to be counted by first - setting a threshold for that value and then counting those events. - Also, usage and limit files encode their units in the filename. - That makes the filenames very long, even though this is not - information that a user needs to be reminded of every time they type - out those names. - - To address these naming issues, as well as to signal clearly that - the new interface carries a new configuration model, the naming - conventions in it necessarily differ from the old interface. - -- The original limit files indicate the state of an unset limit with a - Very High Number, and a configured limit can be unset by echoing -1 - into those files. But that very high number is implementation and - architecture dependent and not very descriptive. And while -1 can - be understood as an underflow into the highest possible value, -2 or - -10M etc. do not work, so it's not consistent. - - memory.low, memory.high, and memory.max will use the string "max" to - indicate and set the highest possible value. - -6. Planned Changes - -6-1. 
CAP for resource control - -Unified hierarchy will require one of the capabilities(7), which is -yet to be decided, for all resource control related knobs. Process -organization operations - creation of sub-cgroups and migration of -processes in sub-hierarchies may be delegated by changing the -ownership and/or permissions on the cgroup directory and -"cgroup.procs" interface file; however, all operations which affect -resource control - writes to a "cgroup.subtree_control" file or any -controller-specific knobs - will require an explicit CAP privilege. - -This, in part, is to prevent the cgroup interface from being -inadvertently promoted to programmable API used by non-privileged -binaries. cgroup exposes various aspects of the system in ways which -aren't properly abstracted for direct consumption by regular programs. -This is an administration interface much closer to sysctl knobs than -system calls. Even the basic access model, being filesystem path -based, isn't suitable for direct consumption. There's no way to -access "my cgroup" in a race-free way or make multiple operations -atomic against migration to another cgroup. - -Another aspect is that, for better or for worse, the cgroup interface -goes through far less scrutiny than regular interfaces for -unprivileged userland. The upside is that cgroup is able to expose -useful features which may not be suitable for general consumption in a -reasonable time frame. It provides a relatively short path between -internal details and userland-visible interface. Of course, this -shortcut comes with high risk. We go through what we go through for -general kernel APIs for good reasons. It may end up leaking internal -details in a way which can exert significant pain by locking the -kernel into a contract that can't be maintained in a reasonable -manner. - -Also, due to the specific nature, cgroup and its controllers don't -tend to attract attention from a wide scope of developers. cgroup's -short history is already fraught with severely mis-designed -interfaces, unnecessary commitments to and exposing of internal -details, broken and dangerous implementations of various features. - -Keeping cgroup as an administration interface is both advantageous for -its role and imperative given its nature. Some of the cgroup features -may make sense for unprivileged access. If deemed justified, those -must be further abstracted and implemented as a different interface, -be it a system call or process-private filesystem, and survive through -the scrutiny that any interface for general consumption is required to -go through. - -Requiring CAP is not a complete solution but should serve as a -significant deterrent against spraying cgroup usages in non-privileged -programs. -- cgit v0.10.2 From 6c2920926b10e8303378408e3c2b8952071d4344 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 16 Nov 2015 11:13:34 -0500 Subject: cgroup: replace unified-hierarchy.txt with a proper cgroup v2 documentation Now that cgroup v2 is almost out of the door, replace the development documentation unified-hierarchy.txt with Documentation/cgroup.txt which is a superset of unified-hierarchy.txt and authoritatively describes all userland-visible aspects of cgroup. v2: Updated to include all information from blkio-controller.txt and list filesystems which support cgroup writeback as suggested by Vivek. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Vivek Goyal diff --git a/Documentation/cgroup-legacy/blkio-controller.txt b/Documentation/cgroup-legacy/blkio-controller.txt index 52fa9f3..4ecc954 100644 --- a/Documentation/cgroup-legacy/blkio-controller.txt +++ b/Documentation/cgroup-legacy/blkio-controller.txt @@ -374,82 +374,3 @@ One can experience an overall throughput drop if you have created multiple groups and put applications in that group which are not driving enough IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle on individual groups and throughput should improve. - -Writeback -========= - -Page cache is dirtied through buffered writes and shared mmaps and -written asynchronously to the backing filesystem by the writeback -mechanism. Writeback sits between the memory and IO domains and -regulates the proportion of dirty memory by balancing dirtying and -write IOs. - -On traditional cgroup hierarchies, relationships between different -controllers cannot be established making it impossible for writeback -to operate accounting for cgroup resource restrictions and all -writeback IOs are attributed to the root cgroup. - -If both the blkio and memory controllers are used on the v2 hierarchy -and the filesystem supports cgroup writeback, writeback operations -correctly follow the resource restrictions imposed by both memory and -blkio controllers. - -Writeback examines both system-wide and per-cgroup dirty memory status -and enforces the more restrictive of the two. Also, writeback control -parameters which are absolute values - vm.dirty_bytes and -vm.dirty_background_bytes - are distributed across cgroups according -to their current writeback bandwidth. - -There's a peculiarity stemming from the discrepancy in ownership -granularity between memory controller and writeback. While memory -controller tracks ownership per page, writeback operates on inode -basis. cgroup writeback bridges the gap by tracking ownership by -inode but migrating ownership if too many foreign pages, pages which -don't match the current inode ownership, have been encountered while -writing back the inode. - -This is a conscious design choice as writeback operations are -inherently tied to inodes making strictly following page ownership -complicated and inefficient. The only use case which suffers from -this compromise is multiple cgroups concurrently dirtying disjoint -regions of the same inode, which is an unlikely use case and decided -to be unsupported. Note that as memory controller assigns page -ownership on the first use and doesn't update it until the page is -released, even if cgroup writeback strictly follows page ownership, -multiple cgroups dirtying overlapping areas wouldn't work as expected. -In general, write-sharing an inode across multiple cgroups is not well -supported. - -Filesystem support for cgroup writeback ---------------------------------------- - -A filesystem can make writeback IOs cgroup-aware by updating -address_space_operations->writepage[s]() to annotate bio's using the -following two functions. - -* wbc_init_bio(@wbc, @bio) - - Should be called for each bio carrying writeback data and associates - the bio with the inode's owner cgroup. Can be called anytime - between bio allocation and submission. - -* wbc_account_io(@wbc, @page, @bytes) - - Should be called for each data segment being written out. 
While - this function doesn't care exactly when it's called during the - writeback session, it's the easiest and most natural to call it as - data segments are added to a bio. - -With writeback bio's annotated, cgroup support can be enabled per -super_block by setting MS_CGROUPWB in ->s_flags. This allows for -selective disabling of cgroup writeback support which is helpful when -certain filesystem features, e.g. journaled data mode, are -incompatible. - -wbc_init_bio() binds the specified bio to its cgroup. Depending on -the configuration, the bio may be executed at a lower priority and if -the writeback session is holding shared resources, e.g. a journal -entry, may lead to priority inversion. There is no one easy solution -for the problem. Filesystems can try to work around specific problem -cases by skipping wbc_init_bio() or using bio_associate_blkcg() -directly. diff --git a/Documentation/cgroup-legacy/unified-hierarchy.txt b/Documentation/cgroup-legacy/unified-hierarchy.txt deleted file mode 100644 index c1f0e87..0000000 --- a/Documentation/cgroup-legacy/unified-hierarchy.txt +++ /dev/null @@ -1,645 +0,0 @@ - -Cgroup unified hierarchy - -April, 2014 Tejun Heo - -This document describes the changes made by unified hierarchy and -their rationales. It will eventually be merged into the main cgroup -documentation. - -CONTENTS - -1. Background -2. Basic Operation - 2-1. Mounting - 2-2. cgroup.subtree_control - 2-3. cgroup.controllers -3. Structural Constraints - 3-1. Top-down - 3-2. No internal tasks -4. Delegation - 4-1. Model of delegation - 4-2. Common ancestor rule -5. Other Changes - 5-1. [Un]populated Notification - 5-2. Other Core Changes - 5-3. Controller File Conventions - 5-3-1. Format - 5-3-2. Control Knobs - 5-4. Per-Controller Changes - 5-4-1. io - 5-4-2. cpuset - 5-4-3. memory -6. Planned Changes - 6-1. CAP for resource control - - -1. Background - -cgroup allows an arbitrary number of hierarchies and each hierarchy -can host any number of controllers. While this seems to provide a -high level of flexibility, it isn't quite useful in practice. - -For example, as there is only one instance of each controller, utility -type controllers such as freezer which can be useful in all -hierarchies can only be used in one. The issue is exacerbated by the -fact that controllers can't be moved around once hierarchies are -populated. Another issue is that all controllers bound to a hierarchy -are forced to have exactly the same view of the hierarchy. It isn't -possible to vary the granularity depending on the specific controller. - -In practice, these issues heavily limit which controllers can be put -on the same hierarchy and most configurations resort to putting each -controller on its own hierarchy. Only closely related ones, such as -the cpu and cpuacct controllers, make sense to put on the same -hierarchy. This often means that userland ends up managing multiple -similar hierarchies repeating the same steps on each hierarchy -whenever a hierarchy management operation is necessary. - -Unfortunately, support for multiple hierarchies comes at a steep cost. -Internal implementation in cgroup core proper is dazzlingly -complicated but more importantly the support for multiple hierarchies -restricts how cgroup is used in general and what controllers can do. - -There's no limit on how many hierarchies there may be, which means -that a task's cgroup membership can't be described in finite length. 
-The key may contain any varying number of entries and is unlimited in -length, which makes it highly awkward to handle and leads to addition -of controllers which exist only to identify membership, which in turn -exacerbates the original problem. - -Also, as a controller can't have any expectation regarding what shape -of hierarchies other controllers would be on, each controller has to -assume that all other controllers are operating on completely -orthogonal hierarchies. This makes it impossible, or at least very -cumbersome, for controllers to cooperate with each other. - -In most use cases, putting controllers on hierarchies which are -completely orthogonal to each other isn't necessary. What usually is -called for is the ability to have differing levels of granularity -depending on the specific controller. In other words, hierarchy may -be collapsed from leaf towards root when viewed from specific -controllers. For example, a given configuration might not care about -how memory is distributed beyond a certain level while still wanting -to control how CPU cycles are distributed. - -Unified hierarchy is the next version of cgroup interface. It aims to -address the aforementioned issues by having more structure while -retaining enough flexibility for most use cases. Various other -general and controller-specific interface issues are also addressed in -the process. - - -2. Basic Operation - -2-1. Mounting - -Unified hierarchy can be mounted with the following mount command. - - mount -t cgroup2 none $MOUNT_POINT - -All controllers which support the unified hierarchy and are not bound -to other hierarchies are automatically bound to unified hierarchy and -show up at the root of it. Controllers which are enabled only in the -root of unified hierarchy can be bound to other hierarchies. This -allows mixing unified hierarchy with the traditional multiple -hierarchies in a fully backward compatible way. - -A controller can be moved across hierarchies only after the controller -is no longer referenced in its current hierarchy. Because per-cgroup -controller states are destroyed asynchronously and controllers may -have lingering references, a controller may not show up immediately on -the unified hierarchy after the final umount of the previous -hierarchy. Similarly, a controller should be fully disabled to be -moved out of the unified hierarchy and it may take some time for the -disabled controller to become available for other hierarchies; -furthermore, due to dependencies among controllers, other controllers -may need to be disabled too. - -While useful for development and manual configurations, dynamically -moving controllers between the unified and other hierarchies is -strongly discouraged for production use. It is recommended to decide -the hierarchies and controller associations before starting using the -controllers. - - -2-2. cgroup.subtree_control - -All cgroups on unified hierarchy have a "cgroup.subtree_control" file -which governs which controllers are enabled on the children of the -cgroup. Let's assume a hierarchy like the following. - - root - A - B - C - \ D - -root's "cgroup.subtree_control" file determines which controllers are -enabled on A. A's on B. B's on C and D. This coincides with the -fact that controllers on the immediate sub-level are used to -distribute the resources of the parent. In fact, it's natural to -assume that resource control knobs of a child belong to its parent. 
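For a concrete feel, the sketch below enables a controller for the
children of one cgroup using the '+' request syntax described in the
next paragraph. The helper name is made up, the mount point is
assumed to be /sys/fs/cgroup, and error reporting is pared down; with
the layout above, pointing it at B governs C and D:

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  static int enable_for_children(const char *dir, const char *ctrl)
  {
	char path[4096], req[64];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "%s/cgroup.subtree_control", dir);
	snprintf(req, sizeof(req), "+%s", ctrl);

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	/* the kernel rejects controllers which aren't listed in this
	 * cgroup's own "cgroup.controllers" file */
	if (write(fd, req, strlen(req)) != (ssize_t)strlen(req))
		ret = -1;
	close(fd);
	return ret;
  }

So enable_for_children("/sys/fs/cgroup/A/B", "io") would control how C
and D split B's IO, and a "-io" write would clear it again.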
-
-Enabling a controller in a "cgroup.subtree_control" file declares that
-distribution of the respective resources of the cgroup will be
-controlled. Note that this means that controller enable states are
-shared among siblings.
-
-When read, the file contains a space-separated list of currently
-enabled controllers. A write to the file should contain a
-space-separated list of controllers with '+' or '-' prefixed (without
-the quotes). Controllers prefixed with '+' are enabled and '-'
-disabled. If a controller is listed multiple times, the last entry
-wins. The specific operations are executed atomically - either all
-succeed or fail.
-
-
-2-3. cgroup.controllers
-
-Read-only "cgroup.controllers" file contains a space-separated list of
-controllers which can be enabled in the cgroup's
-"cgroup.subtree_control" file.
-
-In the root cgroup, this lists controllers which are not bound to
-other hierarchies and the content changes as controllers are bound to
-and unbound from other hierarchies.
-
-In non-root cgroups, the content of this file equals that of the
-parent's "cgroup.subtree_control" file as only controllers enabled
-from the parent can be used in its children.
-
-
-3. Structural Constraints
-
-3-1. Top-down
-
-As it doesn't make sense to nest control of an uncontrolled resource,
-all non-root "cgroup.subtree_control" files can only contain
-controllers which are enabled in the parent's "cgroup.subtree_control"
-file. A controller can be enabled only if the parent has the
-controller enabled and a controller can't be disabled if one or more
-children have it enabled.
-
-
-3-2. No internal tasks
-
-One long-standing issue that cgroup faces is the competition between
-tasks belonging to the parent cgroup and its children cgroups. This
-is inherently nasty as two different types of entities compete and
-there is no agreed-upon obvious way to handle it. Different
-controllers are doing different things.
-
-The cpu controller considers tasks and cgroups as equivalents and maps
-nice levels to cgroup weights. This works for some cases but falls
-flat when children should be allocated specific ratios of CPU cycles
-and the number of internal tasks fluctuates - the ratios constantly
-change as the number of competing entities fluctuates. There also are
-other issues. The mapping from nice level to weight isn't obvious or
-universal, and there are various other knobs which simply aren't
-available for tasks.
-
-The io controller implicitly creates a hidden leaf node for each
-cgroup to host the tasks. The hidden leaf has its own copies of all
-the knobs with "leaf_" prefixed. While this allows equivalent control
-over internal tasks, it comes with serious drawbacks. It always adds
-an extra layer of nesting which may not be necessary, makes the
-interface messy and significantly complicates the implementation.
-
-The memory controller currently doesn't have a way to control what
-happens between internal tasks and child cgroups and the behavior is
-not clearly defined. There have been attempts to add ad-hoc behaviors
-and knobs to tailor the behavior to specific workloads. Continuing
-this direction will lead to problems which will be extremely difficult
-to resolve in the long term.
-
-Multiple controllers struggle with internal tasks and came up with
-different ways to deal with it; unfortunately, all the approaches in
-use now are severely flawed and, furthermore, the widely different
-behaviors make cgroup as a whole highly inconsistent.
- -It is clear that this is something which needs to be addressed from -cgroup core proper in a uniform way so that controllers don't need to -worry about it and cgroup as a whole shows a consistent and logical -behavior. To achieve that, unified hierarchy enforces the following -structural constraint: - - Except for the root, only cgroups which don't contain any task may - have controllers enabled in their "cgroup.subtree_control" files. - -Combined with other properties, this guarantees that, when a -controller is looking at the part of the hierarchy which has it -enabled, tasks are always only on the leaves. This rules out -situations where child cgroups compete against internal tasks of the -parent. - -There are two things to note. Firstly, the root cgroup is exempt from -the restriction. Root contains tasks and anonymous resource -consumption which can't be associated with any other cgroup and -requires special treatment from most controllers. How resource -consumption in the root cgroup is governed is up to each controller. - -Secondly, the restriction doesn't take effect if there is no enabled -controller in the cgroup's "cgroup.subtree_control" file. This is -important as otherwise it wouldn't be possible to create children of a -populated cgroup. To control resource distribution of a cgroup, the -cgroup must create children and transfer all its tasks to the children -before enabling controllers in its "cgroup.subtree_control" file. - - -4. Delegation - -4-1. Model of delegation - -A cgroup can be delegated to a less privileged user by granting write -access of the directory and its "cgroup.procs" file to the user. Note -that the resource control knobs in a given directory concern the -resources of the parent and thus must not be delegated along with the -directory. - -Once delegated, the user can build sub-hierarchy under the directory, -organize processes as it sees fit and further distribute the resources -it got from the parent. The limits and other settings of all resource -controllers are hierarchical and regardless of what happens in the -delegated sub-hierarchy, nothing can escape the resource restrictions -imposed by the parent. - -Currently, cgroup doesn't impose any restrictions on the number of -cgroups in or nesting depth of a delegated sub-hierarchy; however, -this may in the future be limited explicitly. - - -4-2. Common ancestor rule - -On the unified hierarchy, to write to a "cgroup.procs" file, in -addition to the usual write permission to the file and uid match, the -writer must also have write access to the "cgroup.procs" file of the -common ancestor of the source and destination cgroups. This prevents -delegatees from smuggling processes across disjoint sub-hierarchies. - -Let's say cgroups C0 and C1 have been delegated to user U0 who created -C00, C01 under C0 and C10 under C1 as follows. - - ~~~~~~~~~~~~~ - C0 - C00 - ~ cgroup ~ \ C01 - ~ hierarchy ~ - ~~~~~~~~~~~~~ - C1 - C10 - -C0 and C1 are separate entities in terms of resource distribution -regardless of their relative positions in the hierarchy. The -resources the processes under C0 are entitled to are controlled by -C0's ancestors and may be completely different from C1. It's clear -that the intention of delegating C0 to U0 is allowing U0 to organize -the processes under C0 and further control the distribution of C0's -resources. 
-
-On traditional hierarchies, if a task has write access to "tasks" or
-"cgroup.procs" file of a cgroup and its uid agrees with the target, it
-can move the target to the cgroup. In the above example, U0 will not
-only be able to move processes in each sub-hierarchy but also across
-the two sub-hierarchies, effectively allowing it to violate the
-organizational and resource restrictions implied by the hierarchical
-structure above C0 and C1.
-
-On the unified hierarchy, let's say U0 wants to write the pid of a
-process which has a matching uid and is currently in C10 into
-"C00/cgroup.procs". U0 obviously has write access to the file and
-migration permission on the process; however, the common ancestor of
-the source cgroup C10 and the destination cgroup C00 is above the
-points of delegation and U0 would not have write access to its
-"cgroup.procs" and thus be denied with -EACCES.
-
-
-5. Other Changes
-
-5-1. [Un]populated Notification
-
-cgroup users often need a way to determine when a cgroup's
-subhierarchy becomes empty so that it can be cleaned up. cgroup
-currently provides release_agent for it; unfortunately, this mechanism
-is riddled with issues.
-
-- It delivers events by forking and execing a userland binary
-  specified as the release_agent. This is a long deprecated method of
-  notification delivery. It's extremely heavy, slow and cumbersome to
-  integrate with larger infrastructure.
-
-- There is a single monitoring point at the root. There's no way to
-  delegate management of a subtree.
-
-- The event isn't recursive. It triggers when a cgroup doesn't have
-  any tasks or child cgroups. Events for internal nodes trigger only
-  after all children are removed. This again makes it impossible to
-  delegate management of a subtree.
-
-- Events are filtered from the kernel side. A "notify_on_release"
-  file is used to subscribe to or suppress release events. This is
-  unnecessarily complicated and probably done this way because event
-  delivery itself was expensive.
-
-Unified hierarchy implements "populated" field in "cgroup.events"
-interface file which can be used to monitor whether the cgroup's
-subhierarchy has tasks in it or not. Its value is 0 if there is no
-task in the cgroup and its descendants; otherwise, 1. poll and
-[id]notify events are triggered when the value changes.
-
-This is significantly lighter and simpler and trivially allows
-delegating management of subhierarchy - subhierarchy monitoring can
-block further propagation simply by putting itself or another process
-in the subhierarchy and monitor events that it's interested in from
-there without interfering with monitoring higher in the tree.
-
-In unified hierarchy, the release_agent mechanism is no longer
-supported and the interface files "release_agent" and
-"notify_on_release" do not exist.
-
-
-5-2. Other Core Changes
-
-- None of the mount options is allowed.
-
-- remount is disallowed.
-
-- rename(2) is disallowed.
-
-- The "tasks" file is removed. Everything should be at process
-  granularity. Use the "cgroup.procs" file instead.
-
-- The "cgroup.procs" file is not sorted. pids will be unique unless
-  they got recycled in-between reads.
-
-- The "cgroup.clone_children" file is removed.
-
-- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
-  to before exiting. If the cgroup is removed before the zombie is
-  reaped, " (deleted)" is appended to the path.
-
-
-5-3. Controller File Conventions
-
-5-3-1. Format
-
-In general, all controller files should be in one of the following
-formats whenever possible.
-
-- Values only files
-
-  VAL0 VAL1...\n
-
-- Flat keyed files
-
-  KEY0 VAL0\n
-  KEY1 VAL1\n
-  ...
-
-- Nested keyed files
-
-  KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
-  KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
-  ...
-
-For a writeable file, the format for writing should generally match
-reading; however, controllers may allow omitting later fields or
-implement restricted shortcuts for most common use cases.
-
-For both flat and nested keyed files, only the values for a single key
-can be written at a time. For nested keyed files, the sub key pairs
-may be specified in any order and not all pairs have to be specified.
-
-
-5-3-2. Control Knobs
-
-- Settings for a single feature should generally be implemented in a
-  single file.
-
-- In general, the root cgroup should be exempt from resource control
-  and thus shouldn't have resource control knobs.
-
-- If a controller implements ratio based resource distribution, the
-  control knob should be named "weight" and have the range [1, 10000]
-  and 100 should be the default value. The values are chosen to allow
-  enough and symmetric bias in both directions while keeping it
-  intuitive (the default is 100%).
-
-- If a controller implements an absolute resource guarantee and/or
-  limit, the control knobs should be named "min" and "max"
-  respectively. If a controller implements best effort resource
-  guarantee and/or limit, the control knobs should be named "low" and
-  "high" respectively.
-
-  In the above four control files, the special token "max" should be
-  used to represent upward infinity for both reading and writing.
-
-- If a setting has configurable default value and specific overrides,
-  the default settings should be keyed with "default" and appear as
-  the first entry in the file. Specific entries can use "default" as
-  its value to indicate inheritance of the default value.
-
-- For events which are not very high frequency, an interface file
-  "events" should be created which lists event key value pairs.
-  Whenever a notifiable event happens, file modified event should be
-  generated on the file.
-
-
-5-4. Per-Controller Changes
-
-5-4-1. io
-
-- blkio is renamed to io. The interface is overhauled anyway. The
-  new name is more in line with the other two major controllers, cpu
-  and memory, and better suited given that it may be used for cgroup
-  writeback without involving block layer.
-
-- Everything including stat is always hierarchical making separate
-  recursive stat files pointless and, as no internal node can have
-  tasks, leaf weights are meaningless. The operation model is
-  simplified and the interface is overhauled accordingly.
-
-  io.stat
-
-	The stat file. The reported stats are from the point where
-	bio's are issued to request_queue. The stats are counted
-	independent of which policies are enabled. Each line in the
-	file follows the following format. More fields may later be
-	added at the end.
-
-	  $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS
-
-  io.weight
-
-	The weight setting, currently only available and effective if
-	cfq-iosched is in use for the target device. The weight is
-	between 1 and 10000 and defaults to 100. The first line
-	always contains the default weight in the following format to
-	use when per-device setting is missing.
-
-	  default $WEIGHT
-
-	Subsequent lines list per-device weights of the following
-	format.
- - $MAJ:$MIN $WEIGHT - - Writing "$WEIGHT" or "default $WEIGHT" changes the default - setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight - while "$MAJ:$MIN default" clears it. - - This file is available only on non-root cgroups. - - io.max - - The maximum bandwidth and/or iops setting, only available if - blk-throttle is enabled. The file is of the following format. - - $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS - - ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are - read/write IOs per second. "max" indicates no limit. Writing - to the file follows the same format but the individual - settings may be omitted or specified in any order. - - This file is available only on non-root cgroups. - - -5-4-2. cpuset - -- Tasks are kept in empty cpusets after hotplug and take on the masks - of the nearest non-empty ancestor, instead of being moved to it. - -- A task can be moved into an empty cpuset, and again it takes on the - masks of the nearest non-empty ancestor. - - -5-4-3. memory - -- use_hierarchy is on by default and the cgroup file for the flag is - not created. - -- The original lower boundary, the soft limit, is defined as a limit - that is per default unset. As a result, the set of cgroups that - global reclaim prefers is opt-in, rather than opt-out. The costs - for optimizing these mostly negative lookups are so high that the - implementation, despite its enormous size, does not even provide the - basic desirable behavior. First off, the soft limit has no - hierarchical meaning. All configured groups are organized in a - global rbtree and treated like equal peers, regardless where they - are located in the hierarchy. This makes subtree delegation - impossible. Second, the soft limit reclaim pass is so aggressive - that it not just introduces high allocation latencies into the - system, but also impacts system performance due to overreclaim, to - the point where the feature becomes self-defeating. - - The memory.low boundary on the other hand is a top-down allocated - reserve. A cgroup enjoys reclaim protection when it and all its - ancestors are below their low boundaries, which makes delegation of - subtrees possible. Secondly, new cgroups have no reserve per - default and in the common case most cgroups are eligible for the - preferred reclaim pass. This allows the new low boundary to be - efficiently implemented with just a minor addition to the generic - reclaim code, without the need for out-of-band data structures and - reclaim passes. Because the generic reclaim code considers all - cgroups except for the ones running low in the preferred first - reclaim pass, overreclaim of individual groups is eliminated as - well, resulting in much better overall workload performance. - -- The original high boundary, the hard limit, is defined as a strict - limit that can not budge, even if the OOM killer has to be called. - But this generally goes against the goal of making the most out of - the available memory. The memory consumption of workloads varies - during runtime, and that requires users to overcommit. But doing - that with a strict upper limit requires either a fairly accurate - prediction of the working set size or adding slack to the limit. - Since working set size estimation is hard and error prone, and - getting it wrong results in OOM kills, most users tend to err on the - side of a looser limit and end up wasting precious resources. - - The memory.high boundary on the other hand can be set much more - conservatively. 
When hit, it throttles allocations by forcing them - into direct reclaim to work off the excess, but it never invokes the - OOM killer. As a result, a high boundary that is chosen too - aggressively will not terminate the processes, but instead it will - lead to gradual performance degradation. The user can monitor this - and make corrections until the minimal memory footprint that still - gives acceptable performance is found. - - In extreme cases, with many concurrent allocations and a complete - breakdown of reclaim progress within the group, the high boundary - can be exceeded. But even then it's mostly better to satisfy the - allocation from the slack available in other groups or the rest of - the system than killing the group. Otherwise, memory.max is there - to limit this type of spillover and ultimately contain buggy or even - malicious applications. - -- The original control file names are unwieldy and inconsistent in - many different ways. For example, the upper boundary hit count is - exported in the memory.failcnt file, but an OOM event count has to - be manually counted by listening to memory.oom_control events, and - lower boundary / soft limit events have to be counted by first - setting a threshold for that value and then counting those events. - Also, usage and limit files encode their units in the filename. - That makes the filenames very long, even though this is not - information that a user needs to be reminded of every time they type - out those names. - - To address these naming issues, as well as to signal clearly that - the new interface carries a new configuration model, the naming - conventions in it necessarily differ from the old interface. - -- The original limit files indicate the state of an unset limit with a - Very High Number, and a configured limit can be unset by echoing -1 - into those files. But that very high number is implementation and - architecture dependent and not very descriptive. And while -1 can - be understood as an underflow into the highest possible value, -2 or - -10M etc. do not work, so it's not consistent. - - memory.low, memory.high, and memory.max will use the string "max" to - indicate and set the highest possible value. - -6. Planned Changes - -6-1. CAP for resource control - -Unified hierarchy will require one of the capabilities(7), which is -yet to be decided, for all resource control related knobs. Process -organization operations - creation of sub-cgroups and migration of -processes in sub-hierarchies may be delegated by changing the -ownership and/or permissions on the cgroup directory and -"cgroup.procs" interface file; however, all operations which affect -resource control - writes to a "cgroup.subtree_control" file or any -controller-specific knobs - will require an explicit CAP privilege. - -This, in part, is to prevent the cgroup interface from being -inadvertently promoted to programmable API used by non-privileged -binaries. cgroup exposes various aspects of the system in ways which -aren't properly abstracted for direct consumption by regular programs. -This is an administration interface much closer to sysctl knobs than -system calls. Even the basic access model, being filesystem path -based, isn't suitable for direct consumption. There's no way to -access "my cgroup" in a race-free way or make multiple operations -atomic against migration to another cgroup. - -Another aspect is that, for better or for worse, the cgroup interface -goes through far less scrutiny than regular interfaces for -unprivileged userland. 
The upside is that cgroup is able to expose -useful features which may not be suitable for general consumption in a -reasonable time frame. It provides a relatively short path between -internal details and userland-visible interface. Of course, this -shortcut comes with high risk. We go through what we go through for -general kernel APIs for good reasons. It may end up leaking internal -details in a way which can exert significant pain by locking the -kernel into a contract that can't be maintained in a reasonable -manner. - -Also, due to the specific nature, cgroup and its controllers don't -tend to attract attention from a wide scope of developers. cgroup's -short history is already fraught with severely mis-designed -interfaces, unnecessary commitments to and exposing of internal -details, broken and dangerous implementations of various features. - -Keeping cgroup as an administration interface is both advantageous for -its role and imperative given its nature. Some of the cgroup features -may make sense for unprivileged access. If deemed justified, those -must be further abstracted and implemented as a different interface, -be it a system call or process-private filesystem, and survive through -the scrutiny that any interface for general consumption is required to -go through. - -Requiring CAP is not a complete solution but should serve as a -significant deterrent against spraying cgroup usages in non-privileged -programs. diff --git a/Documentation/cgroup.txt b/Documentation/cgroup.txt new file mode 100644 index 0000000..31d1f7b --- /dev/null +++ b/Documentation/cgroup.txt @@ -0,0 +1,1293 @@ + +Control Group v2 + +October, 2015 Tejun Heo + +This is the authoritative documentation on the design, interface and +conventions of cgroup v2. It describes all userland-visible aspects +of cgroup including core and specific controller behaviors. All +future changes must be reflected in this document. Documentation for +v1 is available under Documentation/cgroup-legacy/. + +CONTENTS + +1. Introduction + 1-1. Terminology + 1-2. What is cgroup? +2. Basic Operations + 2-1. Mounting + 2-2. Organizing Processes + 2-3. [Un]populated Notification + 2-4. Controlling Controllers + 2-4-1. Enabling and Disabling + 2-4-2. Top-down Constraint + 2-4-3. No Internal Process Constraint + 2-5. Delegation + 2-5-1. Model of Delegation + 2-5-2. Delegation Containment + 2-6. Guidelines + 2-6-1. Organize Once and Control + 2-6-2. Avoid Name Collisions +3. Resource Distribution Models + 3-1. Weights + 3-2. Limits + 3-3. Protections + 3-4. Allocations +4. Interface Files + 4-1. Format + 4-2. Conventions + 4-3. Core Interface Files +5. Controllers + 5-1. CPU + 5-1-1. CPU Interface Files + 5-2. Memory + 5-2-1. Memory Interface Files + 5-2-2. Usage Guidelines + 5-2-3. Memory Ownership + 5-3. IO + 5-3-1. IO Interface Files + 5-3-2. Writeback +P. Information on Kernel Programming + P-1. Filesystem Support for Writeback +D. Deprecated v1 Core Features +R. Issues with v1 and Rationales for v2 + R-1. Multiple Hierarchies + R-2. Thread Granularity + R-3. Competition Between Inner Nodes and Threads + R-4. Other Interface Issues + R-5. Controller Issues and Remedies + R-5-1. Memory + + +1. Introduction + +1-1. Terminology + +"cgroup" stands for "control group" and is never capitalized. The +singular form is used to designate the whole feature and also as a +qualifier as in "cgroup controllers". When explicitly referring to +multiple individual control groups, the plural form "cgroups" is used. + + +1-2. What is cgroup? 
+
+cgroup is a mechanism to organize processes hierarchically and
+distribute system resources along the hierarchy in a controlled and
+configurable manner.
+
+cgroup is largely composed of two parts - the core and controllers.
+cgroup core is primarily responsible for hierarchically organizing
+processes. A cgroup controller is usually responsible for
+distributing a specific type of system resource along the hierarchy
+although there are utility controllers which serve purposes other than
+resource distribution.
+
+cgroups form a tree structure and every process in the system belongs
+to one and only one cgroup. All threads of a process belong to the
+same cgroup. On creation, all processes are put in the cgroup that
+the parent process belongs to at the time. A process can be migrated
+to another cgroup. Migration of a process doesn't affect already
+existing descendant processes.
+
+Following certain structural constraints, controllers may be enabled or
+disabled selectively on a cgroup. All controller behaviors are
+hierarchical - if a controller is enabled on a cgroup, it affects all
+processes which belong to the cgroups constituting the inclusive
+sub-hierarchy of the cgroup. When a controller is enabled on a nested
+cgroup, it always restricts the resource distribution further. The
+restrictions set closer to the root in the hierarchy can not be
+overridden from further away.
+
+
+2. Basic Operations
+
+2-1. Mounting
+
+Unlike v1, cgroup v2 has only a single hierarchy. The cgroup v2
+hierarchy can be mounted with the following mount command.
+
+  # mount -t cgroup2 none $MOUNT_POINT
+
+cgroup2 filesystem has the magic number 0x63677270 ("cgrp"). All
+controllers which support v2 and are not bound to a v1 hierarchy are
+automatically bound to the v2 hierarchy and show up at the root.
+Controllers which are not in active use in the v2 hierarchy can be
+bound to other hierarchies. This allows mixing v2 hierarchy with the
+legacy v1 multiple hierarchies in a fully backward compatible way.
+
+A controller can be moved across hierarchies only after the controller
+is no longer referenced in its current hierarchy. Because per-cgroup
+controller states are destroyed asynchronously and controllers may
+have lingering references, a controller may not show up immediately on
+the v2 hierarchy after the final umount of the previous hierarchy.
+Similarly, a controller should be fully disabled to be moved out of
+the unified hierarchy and it may take some time for the disabled
+controller to become available for other hierarchies; furthermore, due
+to inter-controller dependencies, other controllers may need to be
+disabled too.
+
+While useful for development and manual configurations, moving
+controllers dynamically between the v2 and other hierarchies is
+strongly discouraged for production use. It is recommended to decide
+the hierarchies and controller associations before starting to use the
+controllers after system boot.
+
+
+2-2. Organizing Processes
+
+Initially, only the root cgroup exists to which all processes belong.
+A child cgroup can be created by creating a sub-directory.
+
+  # mkdir $CGROUP_NAME
+
+A given cgroup may have multiple child cgroups forming a tree
+structure. Each cgroup has a read-writable interface file
+"cgroup.procs". When read, it lists the PIDs of all processes which
+belong to the cgroup one-per-line. The PIDs are not ordered and the
+same PID may show up more than once if the process got moved to
+another cgroup and then back or the PID got recycled while reading.
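As a small illustration of reading that membership list from a
program, a C sketch; the helper name is made up and error handling is
pared down:

  #include <stdio.h>

  static int list_members(const char *dir)
  {
	char path[4096], line[64];
	FILE *f;

	snprintf(path, sizeof(path), "%s/cgroup.procs", dir);
	f = fopen(path, "r");
	if (!f)
		return -1;
	/* one PID per line; unordered, and a PID may repeat */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
  }

A child created moments earlier with mkdir(2) would print nothing here
until processes are migrated into it.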
+ +A process can be migrated into a cgroup by writing its PID to the +target cgroup's "cgroup.procs" file. Only one process can be migrated +on a single write(2) call. If a process is composed of multiple +threads, writing the PID of any thread migrates all threads of the +process. + +When a process forks a child process, the new process is born into the +cgroup that the forking process belongs to at the time of the +operation. After exit, a process stays associated with the cgroup +that it belonged to at the time of exit until it's reaped; however, a +zombie process does not appear in "cgroup.procs" and thus can't be +moved to another cgroup. + +A cgroup which doesn't have any children or live processes can be +destroyed by removing the directory. Note that a cgroup which doesn't +have any children and is associated only with zombie processes is +considered empty and can be removed. + + # rmdir $CGROUP_NAME + +"/proc/$PID/cgroup" lists a process's cgroup membership. If legacy +cgroup is in use in the system, this file may contain multiple lines, +one for each hierarchy. The entry for cgroup v2 is always in the +format "0::$PATH". + + # cat /proc/842/cgroup + ... + 0::/test-cgroup/test-cgroup-nested + +If the process becomes a zombie and the cgroup it was associated with +is removed subsequently, " (deleted)" is appended to the path. + + # cat /proc/842/cgroup + ... + 0::/test-cgroup/test-cgroup-nested (deleted) + + +2-3. [Un]populated Notification + +Each non-root cgroup has a "cgroup.events" file which contains +"populated" field indicating whether the cgroup's sub-hierarchy has +live processes in it. Its value is 0 if there is no live process in +the cgroup and its descendants; otherwise, 1. poll and [id]notify +events are triggered when the value changes. This can be used, for +example, to start a clean-up operation after all processes of a given +sub-hierarchy have exited. The populated state updates and +notifications are recursive. Consider the following sub-hierarchy +where the numbers in the parentheses represent the numbers of processes +in each cgroup. + + A(4) - B(0) - C(1) + \ D(0) + +A, B and C's "populated" fields would be 1 while D's 0. After the one +process in C exits, B and C's "populated" fields would flip to "0" and +file modified events will be generated on the "cgroup.events" files of +both cgroups. + + +2-4. Controlling Controllers + +2-4-1. Enabling and Disabling + +Each cgroup has a "cgroup.controllers" file which lists all +controllers available for the cgroup to enable. + + # cat cgroup.controllers + cpu io memory + +No controller is enabled by default. Controllers can be enabled and +disabled by writing to the "cgroup.subtree_control" file. + + # echo "+cpu +memory -io" > cgroup.subtree_control + +Only controllers which are listed in "cgroup.controllers" can be +enabled. When multiple operations are specified as above, either they +all succeed or fail. If multiple operations on the same controller +are specified, the last one is effective. + +Enabling a controller in a cgroup indicates that the distribution of +the target resource across its immediate children will be controlled. +Consider the following sub-hierarchy. The enabled controllers are +listed in parentheses. + + A(cpu,memory) - B(memory) - C() + \ D() + +As A has "cpu" and "memory" enabled, A will control the distribution +of CPU cycles and memory to its children, in this case, B. 
As B has +"memory" enabled but not "CPU", C and D will compete freely on CPU +cycles but their division of memory available to B will be controlled. + +As a controller regulates the distribution of the target resource to +the cgroup's children, enabling it creates the controller's interface +files in the child cgroups. In the above example, enabling "cpu" on B +would create the "cpu." prefixed controller interface files in C and +D. Likewise, disabling "memory" from B would remove the "memory." +prefixed controller interface files from C and D. This means that the +controller interface files - anything which doesn't start with +"cgroup." are owned by the parent rather than the cgroup itself. + + +2-4-2. Top-down Constraint + +Resources are distributed top-down and a cgroup can further distribute +a resource only if the resource has been distributed to it from the +parent. This means that all non-root "cgroup.subtree_control" files +can only contain controllers which are enabled in the parent's +"cgroup.subtree_control" file. A controller can be enabled only if +the parent has the controller enabled and a controller can't be +disabled if one or more children have it enabled. + + +2-4-3. No Internal Process Constraint + +Non-root cgroups can only distribute resources to their children when +they don't have any processes of their own. In other words, only +cgroups which don't contain any processes can have controllers enabled +in their "cgroup.subtree_control" files. + +This guarantees that, when a controller is looking at the part of the +hierarchy which has it enabled, processes are always only on the +leaves. This rules out situations where child cgroups compete against +internal processes of the parent. + +The root cgroup is exempt from this restriction. Root contains +processes and anonymous resource consumption which can't be associated +with any other cgroups and requires special treatment from most +controllers. How resource consumption in the root cgroup is governed +is up to each controller. + +Note that the restriction doesn't get in the way if there is no +enabled controller in the cgroup's "cgroup.subtree_control". This is +important as otherwise it wouldn't be possible to create children of a +populated cgroup. To control resource distribution of a cgroup, the +cgroup must create children and transfer all its processes to the +children before enabling controllers in its "cgroup.subtree_control" +file. + + +2-5. Delegation + +2-5-1. Model of Delegation + +A cgroup can be delegated to a less privileged user by granting write +access of the directory and its "cgroup.procs" file to the user. Note +that resource control interface files in a given directory control the +distribution of the parent's resources and thus must not be delegated +along with the directory. + +Once delegated, the user can build sub-hierarchy under the directory, +organize processes as it sees fit and further distribute the resources +it received from the parent. The limits and other settings of all +resource controllers are hierarchical and regardless of what happens +in the delegated sub-hierarchy, nothing can escape the resource +restrictions imposed by the parent. + +Currently, cgroup doesn't impose any restrictions on the number of +cgroups in or nesting depth of a delegated sub-hierarchy; however, +this may be limited explicitly in the future. + + +2-5-2. 
Delegation Containment
+
+A delegated sub-hierarchy is contained in the sense that processes
+can't be moved into or out of the sub-hierarchy by the delegatee.  For
+a process with a non-root euid to migrate a target process into a
+cgroup by writing its PID to the "cgroup.procs" file, the following
+conditions must be met.
+
+- The writer's euid must match either uid or suid of the target
+  process.
+
+- The writer must have write access to the "cgroup.procs" file.
+
+- The writer must have write access to the "cgroup.procs" file of the
+  common ancestor of the source and destination cgroups.
+
+The above three constraints ensure that while a delegatee may migrate
+processes around freely in the delegated sub-hierarchy, it can't pull
+in processes from or push them out beyond the sub-hierarchy.
+
+For example, let's assume cgroups C0 and C1 have been delegated to
+user U0 who created C00, C01 under C0 and C10 under C1 as follows and
+all processes under C0 and C1 belong to U0.
+
+  ~~~~~~~~~~~~~ - C0 - C00
+  ~ cgroup    ~      \ C01
+  ~ hierarchy ~
+  ~~~~~~~~~~~~~ - C1 - C10
+
+Let's also say U0 wants to write the PID of a process which is
+currently in C10 into "C00/cgroup.procs".  U0 has write access to the
+file and its euid matches the uid of the target process; however, the
+common ancestor of the source cgroup C10 and the destination cgroup
+C00 is above the points of delegation and U0 would not have write
+access to its "cgroup.procs" file, so the write will be denied with
+-EACCES.
+
+
+2-6. Guidelines
+
+2-6-1. Organize Once and Control
+
+Migrating a process across cgroups is a relatively expensive operation
+and stateful resources such as memory are not moved together with the
+process.  This is an explicit design decision as there often exist
+inherent trade-offs between migration and various hot paths in terms
+of synchronization cost.
+
+As such, migrating processes across cgroups frequently as a means to
+apply different resource restrictions is discouraged.  A workload
+should be assigned to a cgroup according to the system's logical and
+resource structure once on start-up.  Dynamic adjustments to resource
+distribution can be made by changing controller configuration through
+the interface files.
+
+
+2-6-2. Avoid Name Collisions
+
+Interface files for a cgroup and its child cgroups occupy the same
+directory and it is possible to create child cgroups which collide
+with interface files.
+
+All cgroup core interface files are prefixed with "cgroup." and each
+controller's interface files are prefixed with the controller name and
+a dot.  A controller's name is composed of lowercase letters and
+underscores but never begins with an underscore, so it can be used as
+the prefix character for collision avoidance.  Also, interface file
+names won't start or end with terms which are often used in
+categorizing workloads such as job, service, slice, unit or workload.
+
+cgroup doesn't do anything to prevent name collisions and it's the
+user's responsibility to avoid them.
+
+
+3. Resource Distribution Models
+
+cgroup controllers implement several resource distribution schemes
+depending on the resource type and expected use cases.  This section
+describes major schemes in use along with their expected behaviors.
+
+
+3-1. Weights
+
+A parent's resource is distributed by adding up the weights of all
+active children and giving each the fraction matching the ratio of its
+weight against the sum.  As only children which can make use of the
+resource at the moment participate in the distribution, this is
+work-conserving.
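+
+For example, a minimal sketch of the arithmetic (the weights are
+hypothetical):
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          unsigned int weight[] = { 100, 200, 700 }; /* active children */
+          unsigned int sum = 0, i;
+
+          for (i = 0; i < 3; i++)
+                  sum += weight[i];
+          /* Each active child receives weight / sum of the resource,
+           * here 10%, 20% and 70% respectively. */
+          for (i = 0; i < 3; i++)
+                  printf("child %u: %u%%\n", i, 100 * weight[i] / sum);
+          return 0;
+  }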
+Due to this dynamic nature, the model is usually used for stateless
+resources.
+
+All weights are in the range [1, 10000] with the default at 100.  This
+allows symmetric multiplicative biases in both directions at fine
+enough granularity while staying in the intuitive range.
+
+As long as the weight is in range, all configuration combinations are
+valid and there is no reason to reject configuration changes or
+process migrations.
+
+"cpu.weight" proportionally distributes CPU cycles to active children
+and is an example of this type.
+
+
+3-2. Limits
+
+A child can only consume up to the configured amount of the resource.
+Limits can be over-committed - the sum of the limits of children can
+exceed the amount of resource available to the parent.
+
+Limits are in the range [0, max] and default to "max", which is a
+no-op.
+
+As limits can be over-committed, all configuration combinations are
+valid and there is no reason to reject configuration changes or
+process migrations.
+
+"io.max" limits the maximum BPS and/or IOPS that a cgroup can consume
+on an IO device and is an example of this type.
+
+
+3-3. Protections
+
+A cgroup is protected up to the configured amount of the resource as
+long as the usages of all its ancestors are under their protected
+levels.  Protections can be hard guarantees or best effort soft
+boundaries.  Protections can also be over-committed, in which case
+only up to the amount available to the parent is protected among the
+children.
+
+Protections are in the range [0, max] and default to 0, which is a
+no-op.
+
+As protections can be over-committed, all configuration combinations
+are valid and there is no reason to reject configuration changes or
+process migrations.
+
+"memory.low" implements best-effort memory protection and is an
+example of this type.
+
+
+3-4. Allocations
+
+A cgroup is exclusively allocated a certain amount of a finite
+resource.  Allocations can't be over-committed - the sum of the
+allocations of children cannot exceed the amount of resource
+available to the parent.
+
+Allocations are in the range [0, max] and default to 0, which means
+no resource.
+
+As allocations can't be over-committed, some configuration
+combinations are invalid and should be rejected.  Also, if the
+resource is mandatory for execution of processes, process migrations
+may be rejected.
+
+"cpu.rt.max" hard-allocates realtime slices and is an example of this
+type.
+
+
+4. Interface Files
+
+4-1. Format
+
+All interface files should be in one of the following formats whenever
+possible.
+
+  New-line separated values
+  (when only one value can be written at once)
+
+	VAL0\n
+	VAL1\n
+	...
+
+  Space separated values
+  (when read-only or multiple values can be written at once)
+
+	VAL0 VAL1 ...\n
+
+  Flat keyed
+
+	KEY0 VAL0\n
+	KEY1 VAL1\n
+	...
+
+  Nested keyed
+
+	KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
+	KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
+	...
+
+For a writable file, the format for writing should generally match
+reading; however, controllers may allow omitting later fields or
+implement restricted shortcuts for the most common use cases.
+
+For both flat and nested keyed files, only the values for a single key
+can be written at a time.  For nested keyed files, the sub key pairs
+may be specified in any order and not all pairs have to be specified.
+
+
+4-2. Conventions
+
+- Settings for a single feature should be contained in a single file.
+
+- The root cgroup should be exempt from resource control and thus
+  shouldn't have resource control interface files.
+  Also, informational files on the root cgroup which end up showing
+  global information available elsewhere shouldn't exist.
+
+- If a controller implements weight based resource distribution, its
+  interface file should be named "weight" and have the range [1,
+  10000] with 100 as the default.  The values are chosen to allow
+  enough and symmetric bias in both directions while keeping it
+  intuitive (the default is 100%).
+
+- If a controller implements an absolute resource guarantee and/or
+  limit, the interface files should be named "min" and "max"
+  respectively.  If a controller implements a best-effort resource
+  guarantee and/or limit, the interface files should be named "low"
+  and "high" respectively.
+
+  In the above four control files, the special token "max" should be
+  used to represent upward infinity for both reading and writing.
+
+- If a setting has a configurable default value and keyed specific
+  overrides, the default entry should be keyed with "default" and
+  appear as the first entry in the file.
+
+  The default value can be updated by writing either "default $VAL" or
+  "$VAL".
+
+  When writing to update a specific override, "default" can be used as
+  the value to indicate removal of the override.  Override entries
+  with "default" as the value must not appear when read.
+
+  For example, a setting which is keyed by major:minor device numbers
+  with integer values may look like the following.
+
+    # cat cgroup-example-interface-file
+    default 150
+    8:0 300
+
+  The default value can be updated by
+
+    # echo 125 > cgroup-example-interface-file
+
+  or
+
+    # echo "default 125" > cgroup-example-interface-file
+
+  An override can be set by
+
+    # echo "8:16 170" > cgroup-example-interface-file
+
+  and cleared by
+
+    # echo "8:0 default" > cgroup-example-interface-file
+    # cat cgroup-example-interface-file
+    default 125
+    8:16 170
+
+- For events which are not very high frequency, an interface file
+  "events" should be created which lists event key value pairs.
+  Whenever a notifiable event happens, a file modified event should be
+  generated on the file.
+
+
+4-3. Core Interface Files
+
+All cgroup core files are prefixed with "cgroup."
+
+  cgroup.procs
+
+	A read-write new-line separated values file which exists on
+	all cgroups.
+
+	When read, it lists the PIDs of all processes which belong to
+	the cgroup one-per-line.  The PIDs are not ordered and the
+	same PID may show up more than once if the process got moved
+	to another cgroup and then back, or the PID got recycled while
+	reading.
+
+	A PID can be written to migrate the process associated with
+	the PID to the cgroup.  The writer should match all of the
+	following conditions.
+
+	- Its euid must be root or match either uid or suid of the
+	  target process.
+
+	- It must have write access to the "cgroup.procs" file.
+
+	- It must have write access to the "cgroup.procs" file of the
+	  common ancestor of the source and destination cgroups.
+
+	When delegating a sub-hierarchy, write access to this file
+	should be granted along with the containing directory.
+
+  cgroup.controllers
+
+	A read-only space separated values file which exists on all
+	cgroups.
+
+	It shows a space separated list of all controllers available
+	to the cgroup.  The controllers are not ordered.
+
+  cgroup.subtree_control
+
+	A read-write space separated values file which exists on all
+	cgroups.  Starts out empty.
+
+	When read, it shows a space separated list of the controllers
+	which are enabled to control resource distribution from the
+	cgroup to its children.
+
+	A space separated list of controllers prefixed with '+' or '-'
+	can be written to enable or disable controllers.  A controller
+	name prefixed with '+' enables the controller and one prefixed
+	with '-' disables it.  If a controller appears more than once
+	on the list, the last one is effective.  When multiple enable
+	and disable operations are specified, either all succeed or
+	all fail.
+
+  cgroup.events
+
+	A read-only flat-keyed file which exists on non-root cgroups.
+	The following entries are defined.  Unless specified
+	otherwise, a value change in this file generates a file
+	modified event.
+
+	  populated
+
+		1 if the cgroup or its descendants contain any live
+		processes; otherwise, 0.
+
+
+5. Controllers
+
+5-1. CPU
+
+[NOTE: The interface for the cpu controller hasn't been merged yet]
+
+The "cpu" controller regulates the distribution of CPU cycles.  This
+controller implements weight and absolute bandwidth limit models for
+the normal scheduling policy and an absolute bandwidth allocation
+model for the realtime scheduling policy.
+
+
+5-1-1. CPU Interface Files
+
+All time durations are in microseconds.
+
+  cpu.stat
+
+	A read-only flat-keyed file which exists on non-root cgroups.
+
+	It reports the following six stats.
+
+	  usage_usec
+	  user_usec
+	  system_usec
+	  nr_periods
+	  nr_throttled
+	  throttled_usec
+
+  cpu.weight
+
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "100".
+
+	The weight in the range [1, 10000].
+
+  cpu.max
+
+	A read-write two value file which exists on non-root cgroups.
+	The default is "max 100000".
+
+	The maximum bandwidth limit.  It's in the following format.
+
+	  $MAX $PERIOD
+
+	which indicates that the group may consume up to $MAX in each
+	$PERIOD duration.  "max" for $MAX indicates no limit.  If only
+	one number is written, $MAX is updated.
+
+  cpu.rt.max
+
+	[NOTE: The semantics of this file are still under discussion
+	and the interface hasn't been merged yet]
+
+	A read-write two value file which exists on all cgroups.
+	The default is "0 100000".
+
+	The maximum realtime runtime allocation.  Over-committing
+	configurations are disallowed and process migrations are
+	rejected if not enough bandwidth is available.  It's in the
+	following format.
+
+	  $MAX $PERIOD
+
+	which indicates that the group may consume up to $MAX in each
+	$PERIOD duration.  If only one number is written, $MAX is
+	updated.
+
+
+5-2. Memory
+
+The "memory" controller regulates the distribution of memory.  Memory
+is stateful and implements both limit and protection models.  Due to
+the intertwining between memory usage and reclaim pressure and the
+stateful nature of memory, the distribution model is relatively
+complex.
+
+While not completely watertight, all major memory usages by a given
+cgroup are tracked so that the total memory consumption can be
+accounted and controlled to a reasonable extent.  Currently, the
+following types of memory usages are tracked.
+
+- Userland memory - page cache and anonymous memory.
+
+- Kernel data structures such as dentries and inodes.
+
+- TCP socket buffers.
+
+The above list may expand in the future for better coverage.
+
+
+5-2-1. Memory Interface Files
+
+All memory amounts are in bytes.  If a value which is not aligned to
+PAGE_SIZE is written, the value may be rounded up to the closest
+PAGE_SIZE multiple when read back.
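+
+For example, a minimal sketch which demonstrates the rounding on one
+of the files below ("memory.high" here; the cgroup path is an
+assumption and the comment assumes 4k pages):
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          const char *path = "/sys/fs/cgroup/test-cgroup/memory.high";
+          char buf[32];
+          FILE *f = fopen(path, "w");
+
+          if (!f)
+                  return 1;
+          fprintf(f, "%d", 1000001);      /* not PAGE_SIZE aligned */
+          fclose(f);
+
+          f = fopen(path, "r");
+          if (!f)
+                  return 1;
+          if (fgets(buf, sizeof(buf), f))
+                  printf("%s", buf);      /* may read back 1003520 */
+          fclose(f);
+          return 0;
+  }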
+
+  memory.current
+
+	A read-only single value file which exists on non-root
+	cgroups.
+
+	The total amount of memory currently being used by the cgroup
+	and its descendants.
+
+  memory.low
+
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "0".
+
+	Best-effort memory protection.  If the memory usages of a
+	cgroup and all its ancestors are below their low boundaries,
+	the cgroup's memory won't be reclaimed unless memory can be
+	reclaimed from unprotected cgroups.
+
+	Putting more memory than generally available under this
+	protection is discouraged.
+
+  memory.high
+
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "max".
+
+	Memory usage throttle limit.  This is the main mechanism to
+	control memory usage of a cgroup.  If a cgroup's usage goes
+	over the high boundary, the processes of the cgroup are
+	throttled and put under heavy reclaim pressure.
+
+	Going over the high limit never invokes the OOM killer and
+	under extreme conditions the limit may be breached.
+
+  memory.max
+
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "max".
+
+	Memory usage hard limit.  This is the final protection
+	mechanism.  If a cgroup's memory usage reaches this limit and
+	can't be reduced, the OOM killer is invoked in the cgroup.
+	Under certain circumstances, the usage may go over the limit
+	temporarily.
+
+	As long as the high limit is used and monitored properly,
+	this limit's utility is limited to providing the final safety
+	net.
+
+  memory.events
+
+	A read-only flat-keyed file which exists on non-root cgroups.
+	The following entries are defined.  Unless specified
+	otherwise, a value change in this file generates a file
+	modified event.
+
+	  low
+
+		The number of times the cgroup is reclaimed due to
+		high memory pressure even though its usage is under
+		the low boundary.  This usually indicates that the low
+		boundary is over-committed.
+
+	  high
+
+		The number of times processes of the cgroup are
+		throttled and routed to perform direct memory reclaim
+		because the high memory boundary was exceeded.  For a
+		cgroup whose memory usage is capped by the high limit
+		rather than global memory pressure, this event's
+		occurrences are expected.
+
+	  max
+
+		The number of times the cgroup's memory usage was
+		about to go over the max boundary.  If direct reclaim
+		fails to bring it down, the OOM killer is invoked.
+
+	  oom
+
+		The number of times the OOM killer has been invoked in
+		the cgroup.  This may not exactly match the number of
+		processes killed but should generally be close.
+
+
+5-2-2. General Usage
+
+"memory.high" is the main mechanism to control memory usage.
+Over-committing on the high limit (sum of high limits > available
+memory) and letting global memory pressure distribute memory
+according to usage is a viable strategy.
+
+Because breach of the high limit doesn't trigger the OOM killer but
+throttles the offending cgroup, a management agent has ample
+opportunities to monitor and take appropriate actions such as granting
+more memory or terminating the workload.
+
+Determining whether a cgroup has enough memory is not trivial as
+memory usage doesn't indicate whether the workload can benefit from
+more memory.  For example, a workload which writes data received from
+the network to a file can use all available memory but can also
+perform just as well with a small amount of memory.  A measure of
+memory pressure - how much the workload is being impacted due to lack
+of memory - is necessary to determine whether a workload needs more
+memory; unfortunately, a memory pressure monitoring mechanism isn't
+implemented yet.
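+
+Until such a mechanism is available, a management agent can watch the
+"high" counter in "memory.events" instead; a minimal sketch (the
+cgroup path is an assumption):
+
+  #include <stdio.h>
+  #include <string.h>
+
+  /* Return the "high" event count, or -1 if it can't be read. */
+  static long high_events(const char *path)
+  {
+          char key[16];
+          long val = -1, v;
+          FILE *f = fopen(path, "r");
+
+          if (!f)
+                  return -1;
+          while (fscanf(f, "%15s %ld", key, &v) == 2)
+                  if (!strcmp(key, "high"))
+                          val = v;
+          fclose(f);
+          return val;
+  }
+
+  int main(void)
+  {
+          /* A real agent would sample this periodically. */
+          printf("high events: %ld\n",
+                 high_events("/sys/fs/cgroup/test-cgroup/memory.events"));
+          return 0;
+  }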
+
+
+5-2-3. Memory Ownership
+
+A memory area is charged to the cgroup which instantiated it and stays
+charged to the cgroup until the area is released.  Migrating a process
+to a different cgroup doesn't move the memory usages that it
+instantiated while in the previous cgroup to the new cgroup.
+
+A memory area may be used by processes belonging to different cgroups.
+Which cgroup the area is charged to is indeterminate; however, over
+time, the memory area is likely to end up in a cgroup which has
+enough memory allowance to avoid high reclaim pressure.
+
+If a cgroup sweeps a considerable amount of memory which is expected
+to be accessed repeatedly by other cgroups, it may make sense to use
+POSIX_FADV_DONTNEED to relinquish the ownership of memory areas
+belonging to the affected files to ensure correct memory ownership.
+
+
+5-3. IO
+
+The "io" controller regulates the distribution of IO resources.  This
+controller implements both weight based and absolute bandwidth or IOPS
+limit distribution; however, weight based distribution is available
+only if cfq-iosched is in use and neither scheme is available for
+blk-mq devices.
+
+
+5-3-1. IO Interface Files
+
+  io.stat
+
+	A read-only nested-keyed file which exists on non-root
+	cgroups.
+
+	Lines are keyed by $MAJ:$MIN device numbers and not ordered.
+	The following nested keys are defined.
+
+	  rbytes	Bytes read
+	  wbytes	Bytes written
+	  rios		Number of read IOs
+	  wios		Number of write IOs
+
+	An example read output follows.
+
+	  8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
+	  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
+
+  io.weight
+
+	A read-write flat-keyed file which exists on non-root cgroups.
+	The default is "default 100".
+
+	The first line is the default weight applied to devices
+	without a specific override.  The rest are overrides keyed by
+	$MAJ:$MIN device numbers and not ordered.  The weights are in
+	the range [1, 10000] and specify the relative amount of IO
+	time the cgroup can use in relation to its siblings.
+
+	The default weight can be updated by writing either "default
+	$WEIGHT" or simply "$WEIGHT".  Overrides can be set by writing
+	"$MAJ:$MIN $WEIGHT" and unset by writing "$MAJ:$MIN default".
+
+	An example read output follows.
+
+	  default 100
+	  8:16 200
+	  8:0 50
+
+  io.max
+
+	A read-write nested-keyed file which exists on non-root
+	cgroups.
+
+	BPS and IOPS based IO limit.  Lines are keyed by $MAJ:$MIN
+	device numbers and not ordered.  The following nested keys are
+	defined.
+
+	  rbps		Max read bytes per second
+	  wbps		Max write bytes per second
+	  riops		Max read IO operations per second
+	  wiops		Max write IO operations per second
+
+	When writing, any number of nested key-value pairs can be
+	specified in any order.  "max" can be specified as the value
+	to remove a specific limit.  If the same key is specified
+	multiple times, the outcome is undefined.
+
+	BPS and IOPS are measured in each IO direction and IOs are
+	delayed if the limit is reached.  Temporary bursts are
+	allowed.
+
+	Setting the read limit to 2M BPS and the write limit to 120
+	IOPS for 8:16.
+
+	  echo "8:16 rbps=2097152 wiops=120" > io.max
+
+	Reading returns the following.
+
+	  8:16 rbps=2097152 wbps=max riops=max wiops=120
+
+	The write IOPS limit can be removed by writing the following.
+ + echo "8:16 wiops=max" > io.max + + Reading now returns the following. + + 8:16 rbps=2097152 wbps=max riops=max wiops=max + + +5-3-2. Writeback + +Page cache is dirtied through buffered writes and shared mmaps and +written asynchronously to the backing filesystem by the writeback +mechanism. Writeback sits between the memory and IO domains and +regulates the proportion of dirty memory by balancing dirtying and +write IOs. + +The io controller, in conjunction with the memory controller, +implements control of page cache writeback IOs. The memory controller +defines the memory domain that dirty memory ratio is calculated and +maintained for and the io controller defines the io domain which +writes out dirty pages for the memory domain. Both system-wide and +per-cgroup dirty memory states are examined and the more restrictive +of the two is enforced. + +cgroup writeback requires explicit support from the underlying +filesystem. Currently, cgroup writeback is implemented on ext2, ext4 +and btrfs. On other filesystems, all writeback IOs are attributed to +the root cgroup. + +There are inherent differences in memory and writeback management +which affects how cgroup ownership is tracked. Memory is tracked per +page while writeback per inode. For the purpose of writeback, an +inode is assigned to a cgroup and all IO requests to write dirty pages +from the inode are attributed to that cgroup. + +As cgroup ownership for memory is tracked per page, there can be pages +which are associated with different cgroups than the one the inode is +associated with. These are called foreign pages. The writeback +constantly keeps track of foreign pages and, if a particular foreign +cgroup becomes the majority over a certain period of time, switches +the ownership of the inode to that cgroup. + +While this model is enough for most use cases where a given inode is +mostly dirtied by a single cgroup even when the main writing cgroup +changes over time, use cases where multiple cgroups write to a single +inode simultaneously are not supported well. In such circumstances, a +significant portion of IOs are likely to be attributed incorrectly. +As memory controller assigns page ownership on the first use and +doesn't update it until the page is released, even if writeback +strictly follows page ownership, multiple cgroups dirtying overlapping +areas wouldn't work as expected. It's recommended to avoid such usage +patterns. + +The sysctl knobs which affect writeback behavior are applied to cgroup +writeback as follows. + + vm.dirty_background_ratio + vm.dirty_ratio + + These ratios apply the same to cgroup writeback with the + amount of available memory capped by limits imposed by the + memory controller and system-wide clean memory. + + vm.dirty_background_bytes + vm.dirty_bytes + + For cgroup writeback, this is calculated into ratio against + total available memory and applied the same way as + vm.dirty[_background]_ratio. + + +P. Information on Kernel Programming + +This section contains kernel programming information in the areas +where interacting with cgroup is necessary. cgroup core and +controllers are not covered. + + +P-1. Filesystem Support for Writeback + +A filesystem can support cgroup writeback by updating +address_space_operations->writepage[s]() to annotate bio's using the +following two functions. + + wbc_init_bio(@wbc, @bio) + + Should be called for each bio carrying writeback data and + associates the bio with the inode's owner cgroup. Can be + called anytime between bio allocation and submission. 
+
+  wbc_account_io(@wbc, @page, @bytes)
+
+	Should be called for each data segment being written out.
+	While this function doesn't care exactly when it's called
+	during the writeback session, it's the easiest and most
+	natural to call it as data segments are added to a bio.
+
+With writeback bio's annotated, cgroup support can be enabled per
+super_block by setting SB_I_CGROUPWB in ->s_iflags.  This allows for
+selective disabling of cgroup writeback support which is helpful when
+certain filesystem features, e.g. journaled data mode, are
+incompatible.
+
+wbc_init_bio() binds the specified bio to its cgroup.  Depending on
+the configuration, the bio may be executed at a lower priority and if
+the writeback session is holding shared resources, e.g. a journal
+entry, may lead to priority inversion.  There is no one easy solution
+for the problem.  Filesystems can try to work around specific problem
+cases by skipping wbc_init_bio() or using bio_associate_blkcg()
+directly.
+
+
+D. Deprecated v1 Core Features
+
+- Multiple hierarchies including named ones are not supported.
+
+- Mount options and remounting are not supported.
+
+- The "tasks" file is removed and "cgroup.procs" is not sorted.
+
+- "cgroup.clone_children" is removed.
+
+- /proc/cgroups is meaningless for v2.  Use the "cgroup.controllers"
+  file at the root instead.
+
+
+R. Issues with v1 and Rationales for v2
+
+R-1. Multiple Hierarchies
+
+cgroup v1 allowed an arbitrary number of hierarchies and each
+hierarchy could host any number of controllers.  While this seemed to
+provide a high level of flexibility, it wasn't useful in practice.
+
+For example, as there is only one instance of each controller, utility
+type controllers such as freezer which can be useful in all
+hierarchies could only be used in one.  The issue is exacerbated by
+the fact that controllers couldn't be moved to another hierarchy once
+hierarchies were populated.  Another issue was that all controllers
+bound to a hierarchy were forced to have exactly the same view of the
+hierarchy.  It wasn't possible to vary the granularity depending on
+the specific controller.
+
+In practice, these issues heavily limited which controllers could be
+put on the same hierarchy and most configurations resorted to putting
+each controller on its own hierarchy.  Only closely related ones, such
+as the cpu and cpuacct controllers, made sense to be put on the same
+hierarchy.  This often meant that userland ended up managing multiple
+similar hierarchies, repeating the same steps on each hierarchy
+whenever a hierarchy management operation was necessary.
+
+Furthermore, support for multiple hierarchies came at a steep cost.
+It greatly complicated the cgroup core implementation but, more
+importantly, the support for multiple hierarchies restricted how
+cgroup could be used in general and what controllers were able to do.
+
+There was no limit on how many hierarchies there might be, which meant
+that a thread's cgroup membership couldn't be described in finite
+length.  The key might contain any number of entries and was unlimited
+in length, which made it highly awkward to manipulate and led to the
+addition of controllers which existed only to identify membership,
+which in turn exacerbated the original problem of a proliferating
+number of hierarchies.
+
+Also, as a controller couldn't have any expectation regarding the
+topologies of hierarchies other controllers might be on, each
+controller had to assume that all other controllers were attached to
+completely orthogonal hierarchies.
+This made it impossible, or at least very cumbersome, for controllers
+to cooperate with each other.
+
+In most use cases, putting controllers on hierarchies which are
+completely orthogonal to each other isn't necessary.  What usually is
+called for is the ability to have differing levels of granularity
+depending on the specific controller.  In other words, the hierarchy
+may be collapsed from leaf towards root when viewed from specific
+controllers.  For example, a given configuration might not care about
+how memory is distributed beyond a certain level while still wanting
+to control how CPU cycles are distributed.
+
+
+R-2. Thread Granularity
+
+cgroup v1 allowed threads of a process to belong to different cgroups.
+This didn't make sense for some controllers and those controllers
+ended up implementing different ways to ignore such situations; much
+more importantly, it blurred the line between the API exposed to
+individual applications and the system management interface.
+
+Generally, in-process knowledge is available only to the process
+itself; thus, unlike service-level organization of processes,
+categorizing threads of a process requires active participation from
+the application which owns the target process.
+
+cgroup v1 had an ambiguously defined delegation model which got abused
+in combination with thread granularity.  cgroups were delegated to
+individual applications so that they could create and manage their own
+sub-hierarchies and control resource distributions along them.  This
+effectively raised cgroup to the status of a syscall-like API exposed
+to lay programs.
+
+First of all, cgroup has a fundamentally inadequate interface to be
+exposed this way.  For a process to access its own knobs, it has to
+extract the path on the target hierarchy from /proc/self/cgroup,
+construct the path by appending the name of the knob to the path, open
+and then read and/or write to it.  This is not only extremely clunky
+and unusual but also inherently racy.  There is no conventional way to
+define a transaction across the required steps and nothing can
+guarantee that the process would actually be operating on its own
+sub-hierarchy.
+
+cgroup controllers implemented a number of knobs which would never be
+accepted as public APIs because they were just adding control knobs to
+a system-management pseudo filesystem.  cgroup ended up with interface
+knobs which were not properly abstracted or refined and directly
+revealed kernel internal details.  These knobs got exposed to
+individual applications through the ill-defined delegation mechanism,
+effectively abusing cgroup as a shortcut to implementing public APIs
+without going through the required scrutiny.
+
+This was painful for both userland and the kernel.  Userland ended up
+with misbehaving and poorly abstracted interfaces and the kernel
+inadvertently exposed and became locked into constructs.
+
+
+R-3. Competition Between Inner Nodes and Threads
+
+cgroup v1 allowed threads to be in any cgroup, which created an
+interesting problem where threads belonging to a parent cgroup and its
+child cgroups competed for resources.  This was nasty as two
+different types of entities competed and there was no obvious way to
+settle it.  Different controllers did different things.
+
+The cpu controller considered threads and cgroups as equivalents and
+mapped nice levels to cgroup weights.
This worked for some cases but +fell flat when children wanted to be allocated specific ratios of CPU +cycles and the number of internal threads fluctuated - the ratios +constantly changed as the number of competing entities fluctuated. +There also were other issues. The mapping from nice level to weight +wasn't obvious or universal, and there were various other knobs which +simply weren't available for threads. + +The io controller implicitly created a hidden leaf node for each +cgroup to host the threads. The hidden leaf had its own copies of all +the knobs with "leaf_" prefixed. While this allowed equivalent +control over internal threads, it was with serious drawbacks. It +always added an extra layer of nesting which wouldn't be necessary +otherwise, made the interface messy and significantly complicated the +implementation. + +The memory controller didn't have a way to control what happened +between internal tasks and child cgroups and the behavior was not +clearly defined. There were attempts to add ad-hoc behaviors and +knobs to tailor the behavior to specific workloads which would have +led to problems extremely difficult to resolve in the long term. + +Multiple controllers struggled with internal tasks and came up with +different ways to deal with it; unfortunately, all the approaches were +severely flawed and, furthermore, the widely different behaviors +made cgroup as a whole highly inconsistent. + +This clearly is a problem which needs to be addressed from cgroup core +in a uniform way. + + +R-4. Other Interface Issues + +cgroup v1 grew without oversight and developed a large number of +idiosyncrasies and inconsistencies. One issue on the cgroup core side +was how an empty cgroup was notified - a userland helper binary was +forked and executed for each event. The event delivery wasn't +recursive or delegatable. The limitations of the mechanism also led +to in-kernel event delivery filtering mechanism further complicating +the interface. + +Controller interfaces were problematic too. An extreme example is +controllers completely ignoring hierarchical organization and treating +all cgroups as if they were all located directly under the root +cgroup. Some controllers exposed a large amount of inconsistent +implementation details to userland. + +There also was no consistency across controllers. When a new cgroup +was created, some controllers defaulted to not imposing extra +restrictions while others disallowed any resource usage until +explicitly configured. Configuration knobs for the same type of +control used widely differing naming schemes and formats. Statistics +and information knobs were named arbitrarily and used different +formats and units even in the same controller. + +cgroup v2 establishes common conventions where appropriate and updates +controllers so that they expose minimal and consistent interfaces. + + +R-5. Controller Issues and Remedies + +R-5-1. Memory + +The original lower boundary, the soft limit, is defined as a limit +that is per default unset. As a result, the set of cgroups that +global reclaim prefers is opt-in, rather than opt-out. The costs for +optimizing these mostly negative lookups are so high that the +implementation, despite its enormous size, does not even provide the +basic desirable behavior. First off, the soft limit has no +hierarchical meaning. All configured groups are organized in a global +rbtree and treated like equal peers, regardless where they are located +in the hierarchy. This makes subtree delegation impossible. 
Second, +the soft limit reclaim pass is so aggressive that it not just +introduces high allocation latencies into the system, but also impacts +system performance due to overreclaim, to the point where the feature +becomes self-defeating. + +The memory.low boundary on the other hand is a top-down allocated +reserve. A cgroup enjoys reclaim protection when it and all its +ancestors are below their low boundaries, which makes delegation of +subtrees possible. Secondly, new cgroups have no reserve per default +and in the common case most cgroups are eligible for the preferred +reclaim pass. This allows the new low boundary to be efficiently +implemented with just a minor addition to the generic reclaim code, +without the need for out-of-band data structures and reclaim passes. +Because the generic reclaim code considers all cgroups except for the +ones running low in the preferred first reclaim pass, overreclaim of +individual groups is eliminated as well, resulting in much better +overall workload performance. + +The original high boundary, the hard limit, is defined as a strict +limit that can not budge, even if the OOM killer has to be called. +But this generally goes against the goal of making the most out of the +available memory. The memory consumption of workloads varies during +runtime, and that requires users to overcommit. But doing that with a +strict upper limit requires either a fairly accurate prediction of the +working set size or adding slack to the limit. Since working set size +estimation is hard and error prone, and getting it wrong results in +OOM kills, most users tend to err on the side of a looser limit and +end up wasting precious resources. + +The memory.high boundary on the other hand can be set much more +conservatively. When hit, it throttles allocations by forcing them +into direct reclaim to work off the excess, but it never invokes the +OOM killer. As a result, a high boundary that is chosen too +aggressively will not terminate the processes, but instead it will +lead to gradual performance degradation. The user can monitor this +and make corrections until the minimal memory footprint that still +gives acceptable performance is found. + +In extreme cases, with many concurrent allocations and a complete +breakdown of reclaim progress within the group, the high boundary can +be exceeded. But even then it's mostly better to satisfy the +allocation from the slack available in other groups or the rest of the +system than killing the group. Otherwise, memory.max is there to +limit this type of spillover and ultimately contain buggy or even +malicious applications. -- cgit v0.10.2 From d684a90d38e24dcaf95fdb32c83efe05f80d152a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 11 Nov 2015 16:27:33 -0800 Subject: ahci: per-port msix support Some AHCI controllers support per-port MSI-X vectors. At the same time the Linux AHCI driver needs to support one-off architectures that implement a single MSI-X vector for all ports. The heuristic for enabling AHCI ports becomes, in order of preference: 1/ per-port multi-MSI-X 2/ per-port multi-MSI 3/ single MSI 4/ single MSI-X 5/ legacy INTX This all depends on AHCI implementations with potentially broken MSI-X requesting less vectors than the number of ports. If this assumption is violated we will need to start explicitly white-listing AHCI-MSIX implementations. 
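In code terms, the resulting selection order reduces to roughly the
following sketch (the helper names and flag are those used in the
patch below; error handling is elided):

  static int ahci_irq_setup_sketch(struct pci_dev *pdev,
                                   unsigned int n_ports,
                                   struct ahci_host_priv *hpriv)
  {
          int nvec;

          /* 1/ per-port multi-MSI-X */
          nvec = ahci_init_msix(pdev, n_ports, hpriv, AHCI_HFLAG_MULTI_MSIX);
          if (nvec >= 0)
                  return nvec;

          /* 2/ per-port multi-MSI, else 3/ single MSI */
          nvec = ahci_init_msi(pdev, n_ports, hpriv);
          if (nvec >= 0)
                  return nvec;

          /* 4/ single MSI-X */
          nvec = ahci_init_msix(pdev, n_ports, hpriv, 0);
          if (nvec >= 0)
                  return nvec;

          /* 5/ legacy INTx */
          pci_intx(pdev, 1);
          hpriv->irq = pdev->irq;
          return 0;
  }
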
Reported-by: Ricardo Neri [ricardo: fix struct msix_entry handling] Reported-by: kernel test robot Signed-off-by: Dan Williams Signed-off-by: Tejun Heo diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index cdfbcc5..594fcab 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1306,15 +1306,13 @@ static inline void ahci_gtf_filter_workaround(struct ata_host *host) #endif /* - * ahci_init_msix() only implements single MSI-X support, not multiple - * MSI-X per-port interrupts. This is needed for host controllers that only - * have MSI-X support implemented, but no MSI or intx. + * ahci_init_msix() - optionally enable per-port MSI-X otherwise defer + * to single msi. */ static int ahci_init_msix(struct pci_dev *pdev, unsigned int n_ports, - struct ahci_host_priv *hpriv) + struct ahci_host_priv *hpriv, unsigned long flags) { - int rc, nvec; - struct msix_entry entry = {}; + int nvec, i, rc; /* Do not init MSI-X if MSI is disabled for the device */ if (hpriv->flags & AHCI_HFLAG_NO_MSI) @@ -1324,22 +1322,39 @@ static int ahci_init_msix(struct pci_dev *pdev, unsigned int n_ports, if (nvec < 0) return nvec; - if (!nvec) { + /* + * Proper MSI-X implementations will have a vector per-port. + * Barring that, we prefer single-MSI over single-MSIX. If this + * check fails (not enough MSI-X vectors for all ports) we will + * be called again with the flag clear iff ahci_init_msi() + * fails. + */ + if (flags & AHCI_HFLAG_MULTI_MSIX) { + if (nvec < n_ports) + return -ENODEV; + nvec = n_ports; + } else if (nvec) { + nvec = 1; + } else { + /* + * Emit dev_err() since this was the non-legacy irq + * method of last resort. + */ rc = -ENODEV; goto fail; } - /* - * There can be more than one vector (e.g. for error detection or - * hdd hotplug). Only the first vector (entry.entry = 0) is used. - */ - rc = pci_enable_msix_exact(pdev, &entry, 1); + for (i = 0; i < nvec; i++) + hpriv->msix[i].entry = i; + rc = pci_enable_msix_exact(pdev, hpriv->msix, nvec); if (rc < 0) goto fail; - hpriv->irq = entry.vector; + if (nvec > 1) + hpriv->flags |= AHCI_HFLAG_MULTI_MSIX; + hpriv->irq = hpriv->msix[0].vector; /* for single msi-x */ - return 1; + return nvec; fail: dev_err(&pdev->dev, "failed to enable MSI-X with error %d, # of vectors: %d\n", @@ -1403,20 +1418,25 @@ static int ahci_init_interrupts(struct pci_dev *pdev, unsigned int n_ports, { int nvec; + /* + * Try to enable per-port MSI-X. If the host is not capable + * fall back to single MSI before finally attempting single + * MSI-X. + */ + nvec = ahci_init_msix(pdev, n_ports, hpriv, AHCI_HFLAG_MULTI_MSIX); + if (nvec >= 0) + return nvec; + nvec = ahci_init_msi(pdev, n_ports, hpriv); if (nvec >= 0) return nvec; - /* - * Currently, MSI-X support only implements single IRQ mode and - * exists for controllers which can't do other types of IRQ. Only - * set it up if MSI fails. 
- */ - nvec = ahci_init_msix(pdev, n_ports, hpriv); + /* try single-msix */ + nvec = ahci_init_msix(pdev, n_ports, hpriv, 0); if (nvec >= 0) return nvec; - /* lagacy intx interrupts */ + /* legacy intx interrupts */ pci_intx(pdev, 1); hpriv->irq = pdev->irq; @@ -1578,7 +1598,10 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (!host) return -ENOMEM; host->private_data = hpriv; - + hpriv->msix = devm_kzalloc(&pdev->dev, + sizeof(struct msix_entry) * n_ports, GFP_KERNEL); + if (!hpriv->msix) + return -ENOMEM; ahci_init_interrupts(pdev, n_ports, hpriv); if (!(hpriv->cap & HOST_CAP_SSS) || ahci_ignore_sss) diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 45586c1..9e60c50 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -242,6 +242,7 @@ enum { AHCI_HFLAG_NO_FBS = (1 << 18), /* no FBS */ AHCI_HFLAG_EDGE_IRQ = (1 << 19), /* HOST_IRQ_STAT behaves as Edge Triggered */ + AHCI_HFLAG_MULTI_MSIX = (1 << 20), /* per-port MSI-X */ /* ap->flags bits */ @@ -343,6 +344,7 @@ struct ahci_host_priv { * the PHY position in this array. */ struct phy **phys; + struct msix_entry *msix; /* Optional MSI-X support */ unsigned nports; /* Number of ports */ void *plat_data; /* Other platform data */ unsigned int irq; /* interrupt line */ diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 096064c..0a5645f 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "ahci.h" #include "libata.h" @@ -2470,9 +2471,10 @@ void ahci_set_em_messages(struct ahci_host_priv *hpriv, } EXPORT_SYMBOL_GPL(ahci_set_em_messages); -static int ahci_host_activate_multi_irqs(struct ata_host *host, int irq, +static int ahci_host_activate_multi_irqs(struct ata_host *host, struct scsi_host_template *sht) { + struct ahci_host_priv *hpriv = host->private_data; int i, rc; rc = ata_host_start(host); @@ -2484,6 +2486,12 @@ static int ahci_host_activate_multi_irqs(struct ata_host *host, int irq, */ for (i = 0; i < host->n_ports; i++) { struct ahci_port_priv *pp = host->ports[i]->private_data; + int irq; + + if (hpriv->flags & AHCI_HFLAG_MULTI_MSIX) + irq = hpriv->msix[i].vector; + else + irq = hpriv->irq + i; /* Do not receive interrupts sent by dummy ports */ if (!pp) { @@ -2491,14 +2499,15 @@ static int ahci_host_activate_multi_irqs(struct ata_host *host, int irq, continue; } - rc = devm_request_threaded_irq(host->dev, irq + i, + rc = devm_request_threaded_irq(host->dev, irq, ahci_multi_irqs_intr, ahci_port_thread_fn, 0, pp->irq_desc, host->ports[i]); if (rc) return rc; - ata_port_desc(host->ports[i], "irq %d", irq + i); + ata_port_desc(host->ports[i], "irq %d", irq); } + return ata_host_register(host, sht); } @@ -2519,8 +2528,8 @@ int ahci_host_activate(struct ata_host *host, struct scsi_host_template *sht) int irq = hpriv->irq; int rc; - if (hpriv->flags & AHCI_HFLAG_MULTI_MSI) - rc = ahci_host_activate_multi_irqs(host, irq, sht); + if (hpriv->flags & (AHCI_HFLAG_MULTI_MSI | AHCI_HFLAG_MULTI_MSIX)) + rc = ahci_host_activate_multi_irqs(host, sht); else if (hpriv->flags & AHCI_HFLAG_EDGE_IRQ) rc = ata_host_activate(host, irq, ahci_single_edge_irq_intr, IRQF_SHARED, sht); -- cgit v0.10.2 From a6b7fb764ed2a6b7bb1ac96d93c06787aa589092 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 11 Nov 2015 16:27:38 -0800 Subject: ahci: switch from 'threaded' to 'hardirq' interrupt handling For high frequency I/O the overhead of threaded interrupts impacts performance. A quick out-of-the-box test (i.e. 
no affinity tuning) shows ~10% random read performance at ~20% less cpu. The cpu wins appear to be from reduced lock contention. Signed-off-by: Dan Williams Signed-off-by: Tejun Heo diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 0a5645f..1b6c7cc 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -1796,41 +1796,24 @@ static void ahci_port_intr(struct ata_port *ap) ahci_handle_port_interrupt(ap, port_mmio, status); } -static irqreturn_t ahci_port_thread_fn(int irq, void *dev_instance) +static irqreturn_t ahci_multi_irqs_intr_hard(int irq, void *dev_instance) { struct ata_port *ap = dev_instance; - struct ahci_port_priv *pp = ap->private_data; void __iomem *port_mmio = ahci_port_base(ap); u32 status; - status = atomic_xchg(&pp->intr_status, 0); - if (!status) - return IRQ_NONE; - - spin_lock_bh(ap->lock); - ahci_handle_port_interrupt(ap, port_mmio, status); - spin_unlock_bh(ap->lock); - - return IRQ_HANDLED; -} - -static irqreturn_t ahci_multi_irqs_intr(int irq, void *dev_instance) -{ - struct ata_port *ap = dev_instance; - void __iomem *port_mmio = ahci_port_base(ap); - struct ahci_port_priv *pp = ap->private_data; - u32 status; - VPRINTK("ENTER\n"); status = readl(port_mmio + PORT_IRQ_STAT); writel(status, port_mmio + PORT_IRQ_STAT); - atomic_or(status, &pp->intr_status); + spin_lock(ap->lock); + ahci_handle_port_interrupt(ap, port_mmio, status); + spin_unlock(ap->lock); VPRINTK("EXIT\n"); - return IRQ_WAKE_THREAD; + return IRQ_HANDLED; } static u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked) @@ -2499,10 +2482,9 @@ static int ahci_host_activate_multi_irqs(struct ata_host *host, continue; } - rc = devm_request_threaded_irq(host->dev, irq, - ahci_multi_irqs_intr, - ahci_port_thread_fn, 0, - pp->irq_desc, host->ports[i]); + rc = devm_request_irq(host->dev, irq, ahci_multi_irqs_intr_hard, + 0, pp->irq_desc, host->ports[i]); + if (rc) return rc; ata_port_desc(host->ports[i], "irq %d", irq); -- cgit v0.10.2 From f46c4bd16e26eaf85f82cf95f4c77ade5302171d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 11 Nov 2015 16:37:12 -0800 Subject: ahci: kill 'intr_status' This field in achi_port_priv was only used to support threaded interrupts. Now that we are hardirq only it can be deleted. Signed-off-by: Dan Williams Signed-off-by: Tejun Heo diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 9e60c50..878470f 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -309,7 +309,6 @@ struct ahci_port_priv { unsigned int ncq_saw_d2h:1; unsigned int ncq_saw_dmas:1; unsigned int ncq_saw_sdb:1; - atomic_t intr_status; /* interrupts to handle */ spinlock_t lock; /* protects parent ata_port */ u32 intr_mask; /* interrupts to enable */ bool fbs_supported; /* set iff FBS is supported */ -- cgit v0.10.2 From a946e8c717f9355d1abd5408ed0adc0002d1aed1 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 4 Nov 2015 18:32:37 +0000 Subject: genirq: Delay incrementing interrupt count if it's disabled/pending In case of a wakeup interrupt, irq_pm_check_wakeup disables the interrupt and marks it pending and suspended, disables it and notifies the pm core about the wake event. The interrupt gets handled later once the system is resumed. However the irq stats is updated twice: once when it's disabled waiting for the system to resume and later when it's handled, resulting in wrong counting of the wakeup interrupt when waking up the system. This patch updates the interrupt count so that it's updated only when the interrupt gets handled. 
It's already handled correctly in handle_edge_irq and handle_edge_eoi_irq. Reported-by: Manoil Claudiu Signed-off-by: Sudeep Holla Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1446661957-1019-1-git-send-email-sudeep.holla@arm.com Signed-off-by: Thomas Gleixner diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1520645..05e29de 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -338,7 +338,6 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { @@ -346,6 +345,7 @@ void handle_nested_irq(unsigned int irq) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -412,13 +412,13 @@ void handle_simple_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); out_unlock: @@ -462,7 +462,6 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -473,6 +472,7 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); @@ -532,7 +532,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -544,6 +543,7 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; } + kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); -- cgit v0.10.2 From 90381cba64591e27d0e8bbfe71bf8a98bd2a3db3 Mon Sep 17 00:00:00 2001 From: Drew Richardson Date: Thu, 22 Oct 2015 07:07:01 -0700 Subject: arm64: perf: Convert event enums to #defines The enums are not necessary and this allows the event values to be used to construct static strings at compile time. Signed-off-by: Drew Richardson Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 5b1897e..7e4372e 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -29,60 +29,55 @@ * ARMv8 PMUv3 Performance Events handling code. * Common event types. */ -enum armv8_pmuv3_perf_types { - /* Required events. */ - ARMV8_PMUV3_PERFCTR_PMNC_SW_INCR = 0x00, - ARMV8_PMUV3_PERFCTR_L1_DCACHE_REFILL = 0x03, - ARMV8_PMUV3_PERFCTR_L1_DCACHE_ACCESS = 0x04, - ARMV8_PMUV3_PERFCTR_PC_BRANCH_MIS_PRED = 0x10, - ARMV8_PMUV3_PERFCTR_CLOCK_CYCLES = 0x11, - ARMV8_PMUV3_PERFCTR_PC_BRANCH_PRED = 0x12, - - /* At least one of the following is required. */ - ARMV8_PMUV3_PERFCTR_INSTR_EXECUTED = 0x08, - ARMV8_PMUV3_PERFCTR_OP_SPEC = 0x1B, - - /* Common architectural events. 
*/ - ARMV8_PMUV3_PERFCTR_MEM_READ = 0x06, - ARMV8_PMUV3_PERFCTR_MEM_WRITE = 0x07, - ARMV8_PMUV3_PERFCTR_EXC_TAKEN = 0x09, - ARMV8_PMUV3_PERFCTR_EXC_EXECUTED = 0x0A, - ARMV8_PMUV3_PERFCTR_CID_WRITE = 0x0B, - ARMV8_PMUV3_PERFCTR_PC_WRITE = 0x0C, - ARMV8_PMUV3_PERFCTR_PC_IMM_BRANCH = 0x0D, - ARMV8_PMUV3_PERFCTR_PC_PROC_RETURN = 0x0E, - ARMV8_PMUV3_PERFCTR_MEM_UNALIGNED_ACCESS = 0x0F, - ARMV8_PMUV3_PERFCTR_TTBR_WRITE = 0x1C, - - /* Common microarchitectural events. */ - ARMV8_PMUV3_PERFCTR_L1_ICACHE_REFILL = 0x01, - ARMV8_PMUV3_PERFCTR_ITLB_REFILL = 0x02, - ARMV8_PMUV3_PERFCTR_DTLB_REFILL = 0x05, - ARMV8_PMUV3_PERFCTR_MEM_ACCESS = 0x13, - ARMV8_PMUV3_PERFCTR_L1_ICACHE_ACCESS = 0x14, - ARMV8_PMUV3_PERFCTR_L1_DCACHE_WB = 0x15, - ARMV8_PMUV3_PERFCTR_L2_CACHE_ACCESS = 0x16, - ARMV8_PMUV3_PERFCTR_L2_CACHE_REFILL = 0x17, - ARMV8_PMUV3_PERFCTR_L2_CACHE_WB = 0x18, - ARMV8_PMUV3_PERFCTR_BUS_ACCESS = 0x19, - ARMV8_PMUV3_PERFCTR_MEM_ERROR = 0x1A, - ARMV8_PMUV3_PERFCTR_BUS_CYCLES = 0x1D, -}; + +/* Required events. */ +#define ARMV8_PMUV3_PERFCTR_PMNC_SW_INCR 0x00 +#define ARMV8_PMUV3_PERFCTR_L1_DCACHE_REFILL 0x03 +#define ARMV8_PMUV3_PERFCTR_L1_DCACHE_ACCESS 0x04 +#define ARMV8_PMUV3_PERFCTR_PC_BRANCH_MIS_PRED 0x10 +#define ARMV8_PMUV3_PERFCTR_CLOCK_CYCLES 0x11 +#define ARMV8_PMUV3_PERFCTR_PC_BRANCH_PRED 0x12 + +/* At least one of the following is required. */ +#define ARMV8_PMUV3_PERFCTR_INSTR_EXECUTED 0x08 +#define ARMV8_PMUV3_PERFCTR_OP_SPEC 0x1B + +/* Common architectural events. */ +#define ARMV8_PMUV3_PERFCTR_MEM_READ 0x06 +#define ARMV8_PMUV3_PERFCTR_MEM_WRITE 0x07 +#define ARMV8_PMUV3_PERFCTR_EXC_TAKEN 0x09 +#define ARMV8_PMUV3_PERFCTR_EXC_EXECUTED 0x0A +#define ARMV8_PMUV3_PERFCTR_CID_WRITE 0x0B +#define ARMV8_PMUV3_PERFCTR_PC_WRITE 0x0C +#define ARMV8_PMUV3_PERFCTR_PC_IMM_BRANCH 0x0D +#define ARMV8_PMUV3_PERFCTR_PC_PROC_RETURN 0x0E +#define ARMV8_PMUV3_PERFCTR_MEM_UNALIGNED_ACCESS 0x0F +#define ARMV8_PMUV3_PERFCTR_TTBR_WRITE 0x1C + +/* Common microarchitectural events. */ +#define ARMV8_PMUV3_PERFCTR_L1_ICACHE_REFILL 0x01 +#define ARMV8_PMUV3_PERFCTR_ITLB_REFILL 0x02 +#define ARMV8_PMUV3_PERFCTR_DTLB_REFILL 0x05 +#define ARMV8_PMUV3_PERFCTR_MEM_ACCESS 0x13 +#define ARMV8_PMUV3_PERFCTR_L1_ICACHE_ACCESS 0x14 +#define ARMV8_PMUV3_PERFCTR_L1_DCACHE_WB 0x15 +#define ARMV8_PMUV3_PERFCTR_L2_CACHE_ACCESS 0x16 +#define ARMV8_PMUV3_PERFCTR_L2_CACHE_REFILL 0x17 +#define ARMV8_PMUV3_PERFCTR_L2_CACHE_WB 0x18 +#define ARMV8_PMUV3_PERFCTR_BUS_ACCESS 0x19 +#define ARMV8_PMUV3_PERFCTR_MEM_ERROR 0x1A +#define ARMV8_PMUV3_PERFCTR_BUS_CYCLES 0x1D /* ARMv8 Cortex-A53 specific event types. */ -enum armv8_a53_pmu_perf_types { - ARMV8_A53_PERFCTR_PREFETCH_LINEFILL = 0xC2, -}; +#define ARMV8_A53_PERFCTR_PREFETCH_LINEFILL 0xC2 /* ARMv8 Cortex-A57 specific event types. */ -enum armv8_a57_perf_types { - ARMV8_A57_PERFCTR_L1_DCACHE_ACCESS_LD = 0x40, - ARMV8_A57_PERFCTR_L1_DCACHE_ACCESS_ST = 0x41, - ARMV8_A57_PERFCTR_L1_DCACHE_REFILL_LD = 0x42, - ARMV8_A57_PERFCTR_L1_DCACHE_REFILL_ST = 0x43, - ARMV8_A57_PERFCTR_DTLB_REFILL_LD = 0x4c, - ARMV8_A57_PERFCTR_DTLB_REFILL_ST = 0x4d, -}; +#define ARMV8_A57_PERFCTR_L1_DCACHE_ACCESS_LD 0x40 +#define ARMV8_A57_PERFCTR_L1_DCACHE_ACCESS_ST 0x41 +#define ARMV8_A57_PERFCTR_L1_DCACHE_REFILL_LD 0x42 +#define ARMV8_A57_PERFCTR_L1_DCACHE_REFILL_ST 0x43 +#define ARMV8_A57_PERFCTR_DTLB_REFILL_LD 0x4c +#define ARMV8_A57_PERFCTR_DTLB_REFILL_ST 0x4d /* PMUv3 HW events mapping. 
*/ static const unsigned armv8_pmuv3_perf_map[PERF_COUNT_HW_MAX] = { -- cgit v0.10.2 From 9e9caa6a496174e53d7753baa4779717771da4a7 Mon Sep 17 00:00:00 2001 From: Drew Richardson Date: Thu, 22 Oct 2015 07:07:32 -0700 Subject: arm64: perf: Add event descriptions Add additional information about the ARM architected hardware events to make counters self describing. This makes the hardware PMUs easier to use as perf list contains possible events instead of users having to refer to documentation like the ARM TRMs. Signed-off-by: Drew Richardson Signed-off-by: Will Deacon diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 7e4372e..d880e9cf 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -53,6 +53,8 @@ #define ARMV8_PMUV3_PERFCTR_PC_PROC_RETURN 0x0E #define ARMV8_PMUV3_PERFCTR_MEM_UNALIGNED_ACCESS 0x0F #define ARMV8_PMUV3_PERFCTR_TTBR_WRITE 0x1C +#define ARMV8_PMUV3_PERFCTR_CHAIN 0x1E +#define ARMV8_PMUV3_PERFCTR_BR_RETIRED 0x21 /* Common microarchitectural events. */ #define ARMV8_PMUV3_PERFCTR_L1_ICACHE_REFILL 0x01 @@ -67,6 +69,23 @@ #define ARMV8_PMUV3_PERFCTR_BUS_ACCESS 0x19 #define ARMV8_PMUV3_PERFCTR_MEM_ERROR 0x1A #define ARMV8_PMUV3_PERFCTR_BUS_CYCLES 0x1D +#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE 0x1F +#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE 0x20 +#define ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED 0x22 +#define ARMV8_PMUV3_PERFCTR_STALL_FRONTEND 0x23 +#define ARMV8_PMUV3_PERFCTR_STALL_BACKEND 0x24 +#define ARMV8_PMUV3_PERFCTR_L1D_TLB 0x25 +#define ARMV8_PMUV3_PERFCTR_L1I_TLB 0x26 +#define ARMV8_PMUV3_PERFCTR_L2I_CACHE 0x27 +#define ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL 0x28 +#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE 0x29 +#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL 0x2A +#define ARMV8_PMUV3_PERFCTR_L3D_CACHE 0x2B +#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB 0x2C +#define ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL 0x2D +#define ARMV8_PMUV3_PERFCTR_L21_TLB_REFILL 0x2E +#define ARMV8_PMUV3_PERFCTR_L2D_TLB 0x2F +#define ARMV8_PMUV3_PERFCTR_L21_TLB 0x30 /* ARMv8 Cortex-A53 specific event types. 
*/ #define ARMV8_A53_PERFCTR_PREFETCH_LINEFILL 0xC2 @@ -173,6 +192,123 @@ static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_PC_BRANCH_MIS_PRED, }; +#define ARMV8_EVENT_ATTR_RESOLVE(m) #m +#define ARMV8_EVENT_ATTR(name, config) \ + PMU_EVENT_ATTR_STRING(name, armv8_event_attr_##name, \ + "event=" ARMV8_EVENT_ATTR_RESOLVE(config)) + +ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_PMNC_SW_INCR); +ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1_ICACHE_REFILL); +ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_ITLB_REFILL); +ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1_DCACHE_REFILL); +ARMV8_EVENT_ATTR(l1d_cache, ARMV8_PMUV3_PERFCTR_L1_DCACHE_ACCESS); +ARMV8_EVENT_ATTR(l1d_tlb_refill, ARMV8_PMUV3_PERFCTR_DTLB_REFILL); +ARMV8_EVENT_ATTR(ld_retired, ARMV8_PMUV3_PERFCTR_MEM_READ); +ARMV8_EVENT_ATTR(st_retired, ARMV8_PMUV3_PERFCTR_MEM_WRITE); +ARMV8_EVENT_ATTR(inst_retired, ARMV8_PMUV3_PERFCTR_INSTR_EXECUTED); +ARMV8_EVENT_ATTR(exc_taken, ARMV8_PMUV3_PERFCTR_EXC_TAKEN); +ARMV8_EVENT_ATTR(exc_return, ARMV8_PMUV3_PERFCTR_EXC_EXECUTED); +ARMV8_EVENT_ATTR(cid_write_retired, ARMV8_PMUV3_PERFCTR_CID_WRITE); +ARMV8_EVENT_ATTR(pc_write_retired, ARMV8_PMUV3_PERFCTR_PC_WRITE); +ARMV8_EVENT_ATTR(br_immed_retired, ARMV8_PMUV3_PERFCTR_PC_IMM_BRANCH); +ARMV8_EVENT_ATTR(br_return_retired, ARMV8_PMUV3_PERFCTR_PC_PROC_RETURN); +ARMV8_EVENT_ATTR(unaligned_ldst_retired, ARMV8_PMUV3_PERFCTR_MEM_UNALIGNED_ACCESS); +ARMV8_EVENT_ATTR(br_mis_pred, ARMV8_PMUV3_PERFCTR_PC_BRANCH_MIS_PRED); +ARMV8_EVENT_ATTR(cpu_cycles, ARMV8_PMUV3_PERFCTR_CLOCK_CYCLES); +ARMV8_EVENT_ATTR(br_pred, ARMV8_PMUV3_PERFCTR_PC_BRANCH_PRED); +ARMV8_EVENT_ATTR(mem_access, ARMV8_PMUV3_PERFCTR_MEM_ACCESS); +ARMV8_EVENT_ATTR(l1i_cache, ARMV8_PMUV3_PERFCTR_L1_ICACHE_ACCESS); +ARMV8_EVENT_ATTR(l1d_cache_wb, ARMV8_PMUV3_PERFCTR_L1_DCACHE_WB); +ARMV8_EVENT_ATTR(l2d_cache, ARMV8_PMUV3_PERFCTR_L2_CACHE_ACCESS); +ARMV8_EVENT_ATTR(l2d_cache_refill, ARMV8_PMUV3_PERFCTR_L2_CACHE_REFILL); +ARMV8_EVENT_ATTR(l2d_cache_wb, ARMV8_PMUV3_PERFCTR_L2_CACHE_WB); +ARMV8_EVENT_ATTR(bus_access, ARMV8_PMUV3_PERFCTR_BUS_ACCESS); +ARMV8_EVENT_ATTR(memory_error, ARMV8_PMUV3_PERFCTR_MEM_ERROR); +ARMV8_EVENT_ATTR(inst_spec, ARMV8_PMUV3_PERFCTR_OP_SPEC); +ARMV8_EVENT_ATTR(ttbr_write_retired, ARMV8_PMUV3_PERFCTR_TTBR_WRITE); +ARMV8_EVENT_ATTR(bus_cycles, ARMV8_PMUV3_PERFCTR_BUS_CYCLES); +ARMV8_EVENT_ATTR(chain, ARMV8_PMUV3_PERFCTR_CHAIN); +ARMV8_EVENT_ATTR(l1d_cache_allocate, ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE); +ARMV8_EVENT_ATTR(l2d_cache_allocate, ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE); +ARMV8_EVENT_ATTR(br_retired, ARMV8_PMUV3_PERFCTR_BR_RETIRED); +ARMV8_EVENT_ATTR(br_mis_pred_retired, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED); +ARMV8_EVENT_ATTR(stall_frontend, ARMV8_PMUV3_PERFCTR_STALL_FRONTEND); +ARMV8_EVENT_ATTR(stall_backend, ARMV8_PMUV3_PERFCTR_STALL_BACKEND); +ARMV8_EVENT_ATTR(l1d_tlb, ARMV8_PMUV3_PERFCTR_L1D_TLB); +ARMV8_EVENT_ATTR(l1i_tlb, ARMV8_PMUV3_PERFCTR_L1I_TLB); +ARMV8_EVENT_ATTR(l2i_cache, ARMV8_PMUV3_PERFCTR_L2I_CACHE); +ARMV8_EVENT_ATTR(l2i_cache_refill, ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL); +ARMV8_EVENT_ATTR(l3d_cache_allocate, ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE); +ARMV8_EVENT_ATTR(l3d_cache_refill, ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL); +ARMV8_EVENT_ATTR(l3d_cache, ARMV8_PMUV3_PERFCTR_L3D_CACHE); +ARMV8_EVENT_ATTR(l3d_cache_wb, ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB); +ARMV8_EVENT_ATTR(l2d_tlb_refill, ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL); 
+ARMV8_EVENT_ATTR(l21_tlb_refill, ARMV8_PMUV3_PERFCTR_L21_TLB_REFILL); +ARMV8_EVENT_ATTR(l2d_tlb, ARMV8_PMUV3_PERFCTR_L2D_TLB); +ARMV8_EVENT_ATTR(l21_tlb, ARMV8_PMUV3_PERFCTR_L21_TLB); + +static struct attribute *armv8_pmuv3_event_attrs[] = { + &armv8_event_attr_sw_incr.attr.attr, + &armv8_event_attr_l1i_cache_refill.attr.attr, + &armv8_event_attr_l1i_tlb_refill.attr.attr, + &armv8_event_attr_l1d_cache_refill.attr.attr, + &armv8_event_attr_l1d_cache.attr.attr, + &armv8_event_attr_l1d_tlb_refill.attr.attr, + &armv8_event_attr_ld_retired.attr.attr, + &armv8_event_attr_st_retired.attr.attr, + &armv8_event_attr_inst_retired.attr.attr, + &armv8_event_attr_exc_taken.attr.attr, + &armv8_event_attr_exc_return.attr.attr, + &armv8_event_attr_cid_write_retired.attr.attr, + &armv8_event_attr_pc_write_retired.attr.attr, + &armv8_event_attr_br_immed_retired.attr.attr, + &armv8_event_attr_br_return_retired.attr.attr, + &armv8_event_attr_unaligned_ldst_retired.attr.attr, + &armv8_event_attr_br_mis_pred.attr.attr, + &armv8_event_attr_cpu_cycles.attr.attr, + &armv8_event_attr_br_pred.attr.attr, + &armv8_event_attr_mem_access.attr.attr, + &armv8_event_attr_l1i_cache.attr.attr, + &armv8_event_attr_l1d_cache_wb.attr.attr, + &armv8_event_attr_l2d_cache.attr.attr, + &armv8_event_attr_l2d_cache_refill.attr.attr, + &armv8_event_attr_l2d_cache_wb.attr.attr, + &armv8_event_attr_bus_access.attr.attr, + &armv8_event_attr_memory_error.attr.attr, + &armv8_event_attr_inst_spec.attr.attr, + &armv8_event_attr_ttbr_write_retired.attr.attr, + &armv8_event_attr_bus_cycles.attr.attr, + &armv8_event_attr_chain.attr.attr, + &armv8_event_attr_l1d_cache_allocate.attr.attr, + &armv8_event_attr_l2d_cache_allocate.attr.attr, + &armv8_event_attr_br_retired.attr.attr, + &armv8_event_attr_br_mis_pred_retired.attr.attr, + &armv8_event_attr_stall_frontend.attr.attr, + &armv8_event_attr_stall_backend.attr.attr, + &armv8_event_attr_l1d_tlb.attr.attr, + &armv8_event_attr_l1i_tlb.attr.attr, + &armv8_event_attr_l2i_cache.attr.attr, + &armv8_event_attr_l2i_cache_refill.attr.attr, + &armv8_event_attr_l3d_cache_allocate.attr.attr, + &armv8_event_attr_l3d_cache_refill.attr.attr, + &armv8_event_attr_l3d_cache.attr.attr, + &armv8_event_attr_l3d_cache_wb.attr.attr, + &armv8_event_attr_l2d_tlb_refill.attr.attr, + &armv8_event_attr_l21_tlb_refill.attr.attr, + &armv8_event_attr_l2d_tlb.attr.attr, + &armv8_event_attr_l21_tlb.attr.attr, + NULL +}; + +static struct attribute_group armv8_pmuv3_events_attr_group = { + .name = "events", + .attrs = armv8_pmuv3_event_attrs, +}; + +static const struct attribute_group *armv8_pmuv3_attr_groups[] = { + &armv8_pmuv3_events_attr_group, + NULL +}; /* * Perf Events' indices @@ -641,6 +777,7 @@ static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) armv8_pmu_init(cpu_pmu); cpu_pmu->name = "armv8_cortex_a53"; cpu_pmu->map_event = armv8_a53_map_event; + cpu_pmu->pmu.attr_groups = armv8_pmuv3_attr_groups; return armv8pmu_probe_num_events(cpu_pmu); } @@ -649,6 +786,7 @@ static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu) armv8_pmu_init(cpu_pmu); cpu_pmu->name = "armv8_cortex_a57"; cpu_pmu->map_event = armv8_a57_map_event; + cpu_pmu->pmu.attr_groups = armv8_pmuv3_attr_groups; return armv8pmu_probe_num_events(cpu_pmu); } -- cgit v0.10.2 From 6fde22426be6af261816db5941744b8d3c4c7f96 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 13 Nov 2015 07:55:59 -0600 Subject: GFS2: Delete an unnecessary check before the function call "iput" The iput() function tests whether its argument is NULL and then returns immediately. 
Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Bob Peterson diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index baab99b..1f9de17 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -910,8 +910,7 @@ fail_qc_i: fail_ut_i: iput(sdp->sd_sc_inode); fail: - if (pn) - iput(pn); + iput(pn); return error; } -- cgit v0.10.2 From 3dd1dd8c696bdb7c8dcc9456cb23558ad1b336b8 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Thu, 12 Nov 2015 14:07:52 -0600 Subject: GFS2: Use rht_for_each_entry_rcu in glock_hash_walk This lockdep splat was being triggered on umount: [55715.973122] =============================== [55715.980169] [ INFO: suspicious RCU usage. ] [55715.981021] 4.3.0-11553-g8d3de01-dirty #15 Tainted: G W [55715.982353] ------------------------------- [55715.983301] fs/gfs2/glock.c:1427 suspicious rcu_dereference_protected() usage! The code it refers to is the rht_for_each_entry_safe usage in glock_hash_walk. The condition that triggers the warning is lockdep_rht_bucket_is_held(tbl, hash) which is checked in the __rcu_dereference_protected macro. The rhashtable buckets are not changed in glock_hash_walk so it's safe to rely on the rcu protection. Replace the rht_for_each_entry_safe() usage with rht_for_each_entry_rcu(), which doesn't care whether the bucket lock is held if the rcu read lock is held. Signed-off-by: Andrew Price Signed-off-by: Bob Peterson Acked-by: Steven Whitehouse diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 32e7471..430326e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1417,14 +1417,14 @@ static struct shrinker glock_shrinker = { static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; - struct rhash_head *pos, *next; + struct rhash_head *pos; const struct bucket_table *tbl; int i; rcu_read_lock(); tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table); for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) { + rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) { if ((gl->gl_name.ln_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); -- cgit v0.10.2 From c8d577038449a718ad0027d1790b6ef4441715d4 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 11 Nov 2015 15:00:35 -0600 Subject: gfs2: Extended attribute readahead When gfs2 allocates an inode and its extended attribute block next to each other at inode create time, the inode's directory entry indicates that in de_rahead. In that case, we can readahead the extended attribute block when we read in the inode. 
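In essence, the readahead added here is a fire-and-forget asynchronous metadata read; a condensed sketch of the helper introduced by the patch below (see the full diff for the gfs2 specifics):

static void meta_readahead_sketch(struct gfs2_glock *gl, u64 blkno)
{
	struct buffer_head *bh;

	bh = gfs2_getbuf(gl, blkno, 1);		/* get or create the buffer */
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {		/* already cached: nothing to do */
		unlock_buffer(bh);
		brelse(bh);
		return;
	}
	bh->b_end_io = end_buffer_read_sync;	/* completion only unlocks the bh */
	submit_bh(READA | REQ_META | REQ_PRIO, bh);	/* async hint; nobody waits */
}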
Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index ad8a5b7..c248659 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -108,7 +108,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head *bh; int error; - error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh); if (error) return error; if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { @@ -305,7 +305,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, BUG_ON(extlen < 1); bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { - error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh); if (error) goto fail; } @@ -723,7 +723,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, { int error; - error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); + error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp); if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { /* pr_info("block num=%llu\n", leaf_no); */ error = -EIO; @@ -1560,15 +1560,22 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); if (dent) { + struct inode *inode; + u16 rahead; + if (IS_ERR(dent)) return ERR_CAST(dent); dtype = be16_to_cpu(dent->de_type); + rahead = be16_to_cpu(dent->de_rahead); addr = be64_to_cpu(dent->de_inum.no_addr); formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino); brelse(bh); if (fail_on_exist) return ERR_PTR(-EEXIST); - return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); + inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); + if (!IS_ERR(inode)) + GFS2_I(inode)->i_rahead = rahead; + return inode; } return ERR_PTR(-ENOENT); } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index de7b4f9..77e7784 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -402,6 +402,7 @@ struct gfs2_inode { u32 i_diskflags; u8 i_height; u8 i_depth; + u16 i_rahead; }; /* diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0e1d4be..0f24828 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -187,6 +187,21 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) return bh; } +static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) +{ + struct buffer_head *bh; + + bh = gfs2_getbuf(gl, blkno, 1); + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + return; + } + bh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META | REQ_PRIO, bh); +} + /** * gfs2_meta_read - Read a block from disk * @gl: The glock covering the block @@ -198,7 +213,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) */ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - struct buffer_head **bhp) + int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct buffer_head *bh; @@ -213,11 +228,15 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); + if (rahead) + gfs2_meta_readahead(gl, blkno + 1); return 0; } bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); + if (rahead) + gfs2_meta_readahead(gl, blkno + 1); if (!(flags & DIO_WAIT)) return 0; @@ -341,8 +360,12 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, 
struct buffer_head *bh; int ret = 0; u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; + int rahead = 0; + + if (num == ip->i_no_addr) + rahead = ip->i_rahead; - ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); + ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh); if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 8ca1615..c5086c8 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -53,7 +53,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - struct buffer_head **bhp); + int rahead, struct buffer_head **bhp); extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3a31226..e01298d 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -388,7 +388,7 @@ static int bh_get(struct gfs2_quota_data *qd) error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); if (error) goto fail; - error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh); if (error) goto fail; error = -EIO; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c134c04..ac0a65d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1158,7 +1158,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; - error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); + error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh); if (error) goto fail; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 894fb01..8f94282 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1633,6 +1633,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) ip->i_gl = NULL; ip->i_rgd = NULL; ip->i_res = NULL; + ip->i_rahead = 0; } return &ip->i_inode; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 4c096fa..f0fe884 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -119,7 +119,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) __be64 *eablk, *end; int error; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh); if (error) return error; @@ -143,7 +143,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) break; bn = be64_to_cpu(*eablk); - error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh); + error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh); if (error) break; error = ea_foreach_i(ip, eabh, ea_call, data); @@ -477,7 +477,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, return -ENOMEM; for (x = 0; x < nptrs; x++) { - error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0, bh + x); if (error) { while (x--) @@ -977,7 +977,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { __be64 *end; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); if (error) return error; @@ -1303,7 +1303,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); 
- error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); if (error) return error; -- cgit v0.10.2 From b522842c43e49bff1a1c929628f9fd46e596c06b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 27 Oct 2015 16:42:06 +0100 Subject: ARM: 8448/1: add some L220 DT settings The RealView ARM11MPCore enables parity, eventmon and shared override in the cache controller through its current boardfile, but the code and DT bindings for the ARM L220 are currently lacking the ability to set this up from DT. Add the required bool parameters for parity and shared override, but keep eventmon out of it: this should be enabled by the event monitor code. Cc: devicetree@vger.kernel.org Acked-by: Rob Herring Signed-off-by: Linus Walleij Signed-off-by: Russell King diff --git a/Documentation/devicetree/bindings/arm/l2cc.txt b/Documentation/devicetree/bindings/arm/l2cc.txt index 06c88a4..d181b7c 100644 --- a/Documentation/devicetree/bindings/arm/l2cc.txt +++ b/Documentation/devicetree/bindings/arm/l2cc.txt @@ -67,12 +67,14 @@ Optional properties: disable if zero. - arm,prefetch-offset : Override prefetch offset value. Valid values are 0-7, 15, 23, and 31. -- arm,shared-override : The default behavior of the pl310 cache controller with - respect to the shareable attribute is to transform "normal memory - non-cacheable transactions" into "cacheable no allocate" (for reads) or - "write through no write allocate" (for writes). +- arm,shared-override : The default behavior of the L220 or PL310 cache + controllers with respect to the shareable attribute is to transform "normal + memory non-cacheable transactions" into "cacheable no allocate" (for reads) + or "write through no write allocate" (for writes). On systems where this may cause DMA buffer corruption, this property must be specified to indicate that such transforms are precluded. +- arm,parity-enable : enable parity checking on the L2 cache (L220 or PL310). +- arm,parity-disable : disable parity checking on the L2 cache (L220 or PL310). - prefetch-data : Data prefetch. Value: <0> (forcibly disable), <1> (forcibly enable), property absent (retain settings set by firmware) - prefetch-instr : Instruction prefetch. 
Value: <0> (forcibly disable), diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c index 493692d..3f3008e 100644 --- a/arch/arm/mm/cache-l2x0.c +++ b/arch/arm/mm/cache-l2x0.c @@ -1060,6 +1060,18 @@ static void __init l2x0_of_parse(const struct device_node *np, val |= (dirty - 1) << L2X0_AUX_CTRL_DIRTY_LATENCY_SHIFT; } + if (of_property_read_bool(np, "arm,parity-enable")) { + mask &= ~L2C_AUX_CTRL_PARITY_ENABLE; + val |= L2C_AUX_CTRL_PARITY_ENABLE; + } else if (of_property_read_bool(np, "arm,parity-disable")) { + mask &= ~L2C_AUX_CTRL_PARITY_ENABLE; + } + + if (of_property_read_bool(np, "arm,shared-override")) { + mask &= ~L2C_AUX_CTRL_SHARED_OVERRIDE; + val |= L2C_AUX_CTRL_SHARED_OVERRIDE; + } + ret = l2x0_cache_size_of_parse(np, aux_val, aux_mask, &assoc, SZ_256K); if (ret) return; @@ -1176,6 +1188,14 @@ static void __init l2c310_of_parse(const struct device_node *np, *aux_mask &= ~L2C_AUX_CTRL_SHARED_OVERRIDE; } + if (of_property_read_bool(np, "arm,parity-enable")) { + *aux_val |= L2C_AUX_CTRL_PARITY_ENABLE; + *aux_mask &= ~L2C_AUX_CTRL_PARITY_ENABLE; + } else if (of_property_read_bool(np, "arm,parity-disable")) { + *aux_val &= ~L2C_AUX_CTRL_PARITY_ENABLE; + *aux_mask &= ~L2C_AUX_CTRL_PARITY_ENABLE; + } + prefetch = l2x0_saved_regs.prefetch_ctrl; ret = of_property_read_u32(np, "arm,double-linefill", &val); -- cgit v0.10.2 From 440ee365d30adbcd9a97fba9cd629cedb7e9b7cb Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Fri, 30 Oct 2015 21:08:05 +0100 Subject: ARM: 8450/1: v7-M: Use ret_to_user_from_irq in PendSV handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PendSV handler calls v7m_exception_entry which disables IRQs. Therefore, since IRQs are already disabled, the PendSV handler can return using ret_to_user_from_irq. Signed-off-by: Ezequiel Garcia Acked-by: Uwe Kleine-König Signed-off-by: Russell King diff --git a/arch/arm/kernel/entry-v7m.S b/arch/arm/kernel/entry-v7m.S index b6c8bb9..907534f 100644 --- a/arch/arm/kernel/entry-v7m.S +++ b/arch/arm/kernel/entry-v7m.S @@ -88,7 +88,7 @@ __pendsv_entry: @ execute the pending work, including reschedule get_thread_info tsk mov why, #0 - b ret_to_user + b ret_to_user_from_irq ENDPROC(__pendsv_entry) /* -- cgit v0.10.2 From a4124e7296000242243996e1ae2601cfadf276a5 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Wed, 4 Nov 2015 17:08:37 +0100 Subject: ARM: 8451/1: v7-M: Set an early stack for __v7m_setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On ARM v7-M, when PROCINFO_INITFUNC (__v7m_setup) is called, a stack is needed before calling the supervisor call (SVC), which is used by the supervisor call to save the context. Currently, __v7m_setup() prepares a temporary stack in the .text.init section, which is broken if the kernel is executing directly from read-only memory. In particular, this is the case for LPC43xx, which allows executing the kernel in-place from a serial flash through its SPIFI controller. This commit fixes the issue by setting an early stack to its usual location. Also, __v7m_setup() is currently saving and restoring the previous stack. That was bogus, because there's no stack previously set, so this commit removes it. 
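For background, a sketch (not code from the patch): on ARMv7-M every exception entry, including the SVC issued in __v7m_setup, makes the processor itself push an eight-word frame onto the current stack, which is why sp must point at writable RAM rather than flash:

/* ARMv7-M basic exception frame, pushed by hardware on exception entry */
struct v7m_exception_frame {
	u32 r0, r1, r2, r3;	/* caller-saved argument registers */
	u32 r12;
	u32 lr;			/* LR of the interrupted context */
	u32 pc;			/* return address */
	u32 xpsr;		/* program status, including the Thumb bit */
};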
Acked-by: Uwe Kleine-König Signed-off-by: Ezequiel Garcia Signed-off-by: Russell King diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S index 67d9209..7229d8d 100644 --- a/arch/arm/mm/proc-v7m.S +++ b/arch/arm/mm/proc-v7m.S @@ -12,6 +12,7 @@ */ #include #include +#include #include #include "proc-macros.S" @@ -97,19 +98,19 @@ __v7m_setup: mov r5, #0x00800000 str r5, [r0, V7M_SCB_SHPR3] @ set PendSV priority - @ SVC to run the kernel in this mode + @ SVC to switch to handler mode. Notice that this requires sp to + @ point to writeable memory because the processor saves + @ some registers to the stack. badr r1, 1f ldr r5, [r12, #11 * 4] @ read the SVC vector entry str r1, [r12, #11 * 4] @ write the temporary SVC vector entry mov r6, lr @ save LR - mov r7, sp @ save SP - ldr sp, =__v7m_setup_stack_top + ldr sp, =init_thread_union + THREAD_START_SP cpsie i svc #0 1: cpsid i str r5, [r12, #11 * 4] @ restore the original SVC vector entry mov lr, r6 @ restore LR - mov sp, r7 @ restore SP @ Special-purpose control register mov r1, #1 @@ -123,11 +124,6 @@ __v7m_setup: ret lr ENDPROC(__v7m_setup) - .align 2 -__v7m_setup_stack: - .space 4 * 8 @ 8 registers -__v7m_setup_stack_top: - define_processor_functions v7m, dabort=nommu_early_abort, pabort=legacy_pabort, nommu=1 .section ".rodata" -- cgit v0.10.2 From 4b7f353b8026b43e66af475ef2266958ddcfa6eb Mon Sep 17 00:00:00 2001 From: Yanbo Li Date: Thu, 12 Nov 2015 10:36:10 -0800 Subject: ath10k: fix the wrong RX rate idx report at 11G mode The RX rate idx is not correct for 11G mode OFDM packets, because the bitrate table starts with the CCK index instead of OFDM. Signed-off-by: Yanbo Li Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c index d1dc1ba..396645b 100644 --- a/drivers/net/wireless/ath/ath10k/htt_rx.c +++ b/drivers/net/wireless/ath/ath10k/htt_rx.c @@ -674,7 +674,7 @@ static void ath10k_htt_rx_h_rates(struct ath10k *ar, rate &= ~RX_PPDU_START_RATE_FLAG; sband = &ar->mac.sbands[status->band]; - status->rate_idx = ath10k_mac_hw_rate_to_idx(sband, rate); + status->rate_idx = ath10k_mac_hw_rate_to_idx(sband, rate, cck); break; case HTT_RX_HT: case HTT_RX_HT_WITH_TXBF: diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 76484a9..6637854 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -90,7 +90,7 @@ static u8 ath10k_mac_bitrate_to_rate(int bitrate) } u8 ath10k_mac_hw_rate_to_idx(const struct ieee80211_supported_band *sband, - u8 hw_rate) + u8 hw_rate, bool cck) { const struct ieee80211_rate *rate; int i; @@ -98,6 +98,9 @@ u8 ath10k_mac_hw_rate_to_idx(const struct ieee80211_supported_band *sband, for (i = 0; i < sband->n_bitrates; i++) { rate = &sband->bitrates[i]; + if (ath10k_mac_bitrate_is_cck(rate->bitrate) != cck) + continue; + if (rate->hw_value == hw_rate) return i; else if (rate->flags & IEEE80211_RATE_SHORT_PREAMBLE && diff --git a/drivers/net/wireless/ath/ath10k/mac.h b/drivers/net/wireless/ath/ath10k/mac.h index f504804..5309158 100644 --- a/drivers/net/wireless/ath/ath10k/mac.h +++ b/drivers/net/wireless/ath/ath10k/mac.h @@ -66,7 +66,7 @@ void ath10k_mac_handle_tx_pause_vdev(struct ath10k *ar, u32 vdev_id, enum wmi_tlv_tx_pause_action action); u8 ath10k_mac_hw_rate_to_idx(const struct ieee80211_supported_band *sband, - u8 hw_rate); + u8 hw_rate, bool cck); u8 ath10k_mac_bitrate_to_idx(const struct ieee80211_supported_band *sband, u32 bitrate); -- cgit v0.10.2 From 
9b1ba7b28e70a1f4066cfa3bebea24953f36feef Mon Sep 17 00:00:00 2001 From: Vladimir Kondratiev Date: Fri, 6 Nov 2015 12:50:44 +0200 Subject: wil6210: hold wil->mutex while managing vrings To prevent race when connect flow may run in parallel with the disconnect event. Scenario leading to the bug is: while running connect flow on the AP, STA sends disconnect. log follows. <7>[ 668.736269] wil6210 0000:01:00.0: wlan0: DBG[ WMI]Configure for connection CID 1 <7>[ 668.736269] wil6210 0000:01:00.0: wlan0: DBG[MISC]wil_vring_init_tx() max_mpdu_size 2048 <7>[ 668.736301] wil6210 0000:01:00.0: wlan0: DBG[MISC]wil_vring_alloc() <7>[ 668.736363] wil6210 0000:01:00.0: wlan0: DBG[MISC]vring[1024] 0xffbe8000:d962ce08 0xdb244000 <7>[ 668.736394] wil6210 0000:01:00.0: wlan0: DBG[ WMI]Head 0x00880300 -> 0x00880308 <7>[ 668.736394] wil6210 0000:01:00.0: wlan0: DBG[ WMI]WMI command 0x0821 [28] <7>[ 668.736426] DBG[ WMI]Cmd 00000000: 20 00 24 00 00 00 00 00 00 00 21 08 00 00 00 00 .$.......!..... <7>[ 668.736426] DBG[ WMI]cmd 00000000: 00 00 00 00 00 00 5f 5c 00 00 00 00 00 04 00 08 ......_\........ <7>[ 668.736457] DBG[ WMI]cmd 00000010: 01 01 00 00 00 00 00 00 00 00 ff 0f ............ <7>[ 668.736488] wil6210 0000:01:00.0: wlan0: DBG[ IRQ]Pseudo IRQ 0x00000004 <7>[ 668.736519] wil6210 0000:01:00.0: wlan0: DBG[ WMI]Handle WMI 0x1824 (reply_id 0x1821) <7>[ 668.736519] wil6210 0000:01:00.0: wlan0: DBG[ IRQ]wil6210_mask_irq_pseudo() <7>[ 668.736519] wil6210 0000:01:00.0: wlan0: DBG[ IRQ]ISR MISC 0x20000000 <7>[ 668.736551] wil6210 0000:01:00.0: wlan0: DBG[ WMI]Handle WMI 0x1003 (reply_id 0x1821) <7>[ 668.736551] wil6210 0000:01:00.0: wlan0: DBG[ WMI]Disconnect 04:ce:14:00:07:70 reason [proto 3 wmi 4] <7>[ 668.736582] wil6210 0000:01:00.0: wlan0: DBG[MISC]wil6210_disconnect() <7>[ 668.736613] wil6210 0000:01:00.0: wlan0: DBG[ IRQ]Thread IRQ <7>[ 668.736613] wil6210 0000:01:00.0: wlan0: DBG[ IRQ]Thread ISR MISC 0x20000000 <7>[ 668.736644] wil6210 0000:01:00.0: wlan0: DBG[ IRQ]MBOX event <7>[ 668.736644] wil6210 0000:01:00.0: wlan0: DBG[ WMI]Mbox head 00880330 tail 00880328 <7>[ 668.736676] wil6210 0000:01:00.0: wlan0: DBG[ WMI]Mbox evt 001a 0010 0000 00 <7>[ 668.736676] wil6210 0000:01:00.0: wlan0: DBG[ WMI]WMI event 0x1821 MID 0 @3255145 msec <7>[ 668.736707] DBG[ WMI]evt 00000000: 1a 00 10 00 00 00 00 10 00 00 21 18 69 ab 31 00 ..........!.i.1. <7>[ 668.736707] DBG[ WMI]evt 00000010: 01 01 00 00 00 00 00 00 ........ 
<7>[ 668.736738] wil6210 0000:01:00.0: wlan0: DBG[ WMI]queue_work -> 0 <7>[ 668.736738] wil6210 0000:01:00.0: wlan0: DBG[ WMI]wmi_recv_cmd -> 1 events queued <7>[ 668.736769] wil6210 0000:01:00.0: wlan0: DBG[ IRQ]wil6210_unmask_irq_pseudo() <7>[ 668.736832] wil6210 0000:01:00.0: wlan0: DBG[MISC]Disconnect 04:ce:14:00:07:70, CID=1, reason=3 <7>[ 668.736832] wil6210 0000:01:00.0: wlan0: DBG[MISC]wil_disconnect_cid(CID 1, status 1) <7>[ 668.736894] wil6210 0000:01:00.0: wlan0: DBG[MISC]wil_vring_fini_tx() id=1 <7>[ 668.736894] wil6210 0000:01:00.0: wlan0: DBG[MISC]free Tx vring 1 [1024] 0xffbe8000:d962ce08 0xdb244000 <7>[ 668.736957] wil6210 0000:01:00.0: wlan0: DBG[ WMI]Handle WMI 0x1821 (reply_id 0x1821) <7>[ 668.736988] wil6210 0000:01:00.0: wlan0: DBG[ WMI]Complete WMI 0x1821 <7>[ 668.737019] wil6210 0000:01:00.0: wlan0: DBG[ WMI]wmi_call(0x0821->0x1821) completed in 0 msec <3>[ 668.737019] wil6210 0000:01:00.0: wlan0: Tx config failed, status 0x01 <7>[ 668.739518] wil6210 0000:01:00.0: wlan0: DBG[MISC]wil_cfg80211_del_station(04:ce:14:00:07:70, reason=2) <7>[ 668.739550] wil6210 0000:01:00.0: wlan0: DBG[MISC]wil6210_disconnect() <7>[ 668.739550] wil6210 0000:01:00.0: wlan0: DBG[MISC]_wil6210_disconnect(bssid=04:ce:14:00:07:70, reason=2, ev-) <7>[ 668.739581] wil6210 0000:01:00.0: wlan0: DBG[MISC]Disconnect 04:ce:14:00:07:70, CID=-2, reason=2 <7>[ 668.742705] wil6210 0000:01:00.0: wlan0: DBG[MISC]free Tx vring 1 [1024] 0x (null):d962ce08 0x (null) <3>[ 668.742736] __dma_free_remap: trying to free invalid coherent area: (null) Signed-off-by: Vladimir Kondratiev Signed-off-by: Maya Erez Signed-off-by: Kalle Valo diff --git a/drivers/net/wireless/ath/wil6210/main.c b/drivers/net/wireless/ath/wil6210/main.c index bb69a59..48687f1 100644 --- a/drivers/net/wireless/ath/wil6210/main.c +++ b/drivers/net/wireless/ath/wil6210/main.c @@ -401,20 +401,26 @@ void wil_bcast_fini(struct wil6210_priv *wil) static void wil_connect_worker(struct work_struct *work) { - int rc; + int rc, cid, ringid; struct wil6210_priv *wil = container_of(work, struct wil6210_priv, connect_worker); struct net_device *ndev = wil_to_ndev(wil); - int cid = wil->pending_connect_cid; - int ringid = wil_find_free_vring(wil); + mutex_lock(&wil->mutex); + cid = wil->pending_connect_cid; if (cid < 0) { wil_err(wil, "No connection pending\n"); - return; + goto out; + } + ringid = wil_find_free_vring(wil); + if (ringid < 0) { + wil_err(wil, "No free vring found\n"); + goto out; } - wil_dbg_wmi(wil, "Configure for connection CID %d\n", cid); + wil_dbg_wmi(wil, "Configure for connection CID %d vring %d\n", + cid, ringid); rc = wil_vring_init_tx(wil, ringid, 1 << tx_ring_order, cid, 0); wil->pending_connect_cid = -1; @@ -424,6 +430,8 @@ static void wil_connect_worker(struct work_struct *work) } else { wil_disconnect_cid(wil, cid, WLAN_REASON_UNSPECIFIED, true); } +out: + mutex_unlock(&wil->mutex); } int wil_priv_init(struct wil6210_priv *wil) diff --git a/drivers/net/wireless/ath/wil6210/txrx.c b/drivers/net/wireless/ath/wil6210/txrx.c index 3bc9bc0..7887e6c 100644 --- a/drivers/net/wireless/ath/wil6210/txrx.c +++ b/drivers/net/wireless/ath/wil6210/txrx.c @@ -160,6 +160,7 @@ static void wil_vring_free(struct wil6210_priv *wil, struct vring *vring, struct device *dev = wil_to_dev(wil); size_t sz = vring->size * sizeof(vring->va[0]); + lockdep_assert_held(&wil->mutex); if (tx) { int vring_index = vring - wil->vring_tx; @@ -749,6 +750,7 @@ int wil_vring_init_tx(struct wil6210_priv *wil, int id, int size, wil_dbg_misc(wil, "%s() max_mpdu_size 
%d\n", __func__, cmd.vring_cfg.tx_sw_ring.max_mpdu_size); + lockdep_assert_held(&wil->mutex); if (vring->va) { wil_err(wil, "Tx ring [%d] already allocated\n", id); @@ -821,6 +823,7 @@ int wil_vring_init_bcast(struct wil6210_priv *wil, int id, int size) wil_dbg_misc(wil, "%s() max_mpdu_size %d\n", __func__, cmd.vring_cfg.tx_sw_ring.max_mpdu_size); + lockdep_assert_held(&wil->mutex); if (vring->va) { wil_err(wil, "Tx ring [%d] already allocated\n", id); @@ -872,7 +875,7 @@ void wil_vring_fini_tx(struct wil6210_priv *wil, int id) struct vring *vring = &wil->vring_tx[id]; struct vring_tx_data *txdata = &wil->vring_tx_data[id]; - WARN_ON(!mutex_is_locked(&wil->mutex)); + lockdep_assert_held(&wil->mutex); if (!vring->va) return; -- cgit v0.10.2 From 7884084f3bcc98adfbd8b90a2bd6bcf10c4df2cd Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 3 Nov 2015 17:34:17 -0500 Subject: cpuidle,x86: increase forced cut-off for polling to 20us The cpuidle menu governor has a forced cut-off for polling at 5us, in order to deal with firmware that gives the OS bad information on cpuidle states, leading to the system spending way too much time in polling. However, at least one x86 CPU family (Atom) has chips that have a 20us break-even point for C1. Forcing the polling cut-off to less than that wastes performance and power. Increase the polling cut-off to 20us. Systems with a lower C1 latency will be found in the states table by the menu governor, which will pick those states as appropriate. Signed-off-by: Rik van Riel Acked-by: Arjan van de Ven Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 22e4463..ecc242a 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -330,7 +330,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) * We want to default to C1 (hlt), not to busy polling * unless the timer is happening really really soon. */ - if (data->next_timer_us > 5 && + if (data->next_timer_us > 20 && !drv->states[CPUIDLE_DRIVER_STATE_START].disabled && dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0) data->last_state_idx = CPUIDLE_DRIVER_STATE_START; -- cgit v0.10.2 From a9ceb78bc75ca47972096372ff3d48648b16317a Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 3 Nov 2015 17:34:18 -0500 Subject: cpuidle,menu: use interactivity_req to disable polling The menu governor carefully figures out how much time we typically sleep for an estimated sleep interval, or whether there is a repeating pattern going on, and corrects that estimate for the CPU load. Then it proceeds to ignore that information when determining whether or not to consider polling. This is not a big deal on most x86 CPUs, which have very low C1 latencies, and the patch should not have any effect on those CPUs. However, certain CPUs (eg. Atom) have much higher C1 latencies, and it would be good to not waste performance and power on those CPUs if we are expecting a very low wakeup latency. Disable polling based on the estimated interactivity requirement, not on the time to the next timer interrupt. Signed-off-by: Rik van Riel Acked-by: Arjan van de Ven Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 22e4463..ecc242a 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -330,7 +330,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) * We want to default to C1 (hlt), not to busy polling * unless the timer is happening really really soon. */ - if (data->next_timer_us > 5 && + if (data->next_timer_us > 20 && !drv->states[CPUIDLE_DRIVER_STATE_START].disabled && dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0) data->last_state_idx = CPUIDLE_DRIVER_STATE_START; -- cgit v0.10.2 From a9ceb78bc75ca47972096372ff3d48648b16317a Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 3 Nov 2015 17:34:18 -0500 Subject: cpuidle,menu: use interactivity_req to disable polling The menu governor carefully figures out how much time we typically sleep for an estimated sleep interval, or whether there is a repeating pattern going on, and corrects that estimate for the CPU load. Then it proceeds to ignore that information when determining whether or not to consider polling. This is not a big deal on most x86 CPUs, which have very low C1 latencies, and the patch should not have any effect on those CPUs. However, certain CPUs (eg. Atom) have much higher C1 latencies, and it would be good to not waste performance and power on those CPUs if we are expecting a very low wakeup latency. Disable polling based on the estimated interactivity requirement, not on the time to the next timer interrupt. Signed-off-by: Rik van Riel Acked-by: Arjan van de Ven Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index ecc242a..b1a5573 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -330,7 +330,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) * We want to default to C1 (hlt), not to busy polling * unless the timer is happening really really soon. */ - if (data->next_timer_us > 20 && + if (interactivity_req > 20 && !drv->states[CPUIDLE_DRIVER_STATE_START].disabled && dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0) data->last_state_idx = CPUIDLE_DRIVER_STATE_START; -- cgit v0.10.2 From efddfd90fb7f5ba3c7d1bff923a3626a78eee553 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 3 Nov 2015 17:34:19 -0500 Subject: cpuidle,menu: smooth out measured_us calculation The cpuidle state tables contain the maximum exit latency for each cpuidle state. On x86, that is the exit latency for when the entire package goes into that same idle state. However, a lot of the time we only go into the core idle state, not the package idle state. This means we see a much smaller exit latency. We have no way to detect whether we went into the core or package idle state while idle, and that is ok. However, the current menu_update logic does have the potential to trip up the repeating pattern detection in get_typical_interval. If the system is experiencing an exit latency near the idle state's exit latency, some of the samples will have exit_us subtracted, while others will not. This turns a repeating pattern into mush, potentially breaking get_typical_interval. Furthermore, for smaller sleep intervals, we know the chance that all the cores in the package went to the same idle state is fairly small. Dividing the measured_us by two, instead of subtracting the full exit latency when hitting a small measured_us, will reduce the error. Signed-off-by: Rik van Riel Acked-by: Arjan van de Ven Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index b1a5573..7b0971d 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -404,8 +404,10 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) measured_us = cpuidle_get_last_residency(dev); /* Deduct exit latency */ - if (measured_us > target->exit_latency) + if (measured_us > 2 * target->exit_latency) measured_us -= target->exit_latency; + else + measured_us /= 2; /* Make sure our coefficients do not exceed unity */ if (measured_us > data->next_timer_us) -- cgit v0.10.2 From 4deea4cb471354a8abdeecc9a571dfdbac8c2481 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 9 Nov 2015 23:24:10 -0200 Subject: [media] dvb: document dvb_frontend_sleep_until() This function is used mainly by the DVB core, in order to provide emulation for a legacy ioctl. The only current exception is the stv0299 driver, which takes more than 8ms to switch voltage, breaking the emulation for FE_DISHNETWORK_SEND_LEGACY_CMD. Document that. 
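As a rough sketch of the timing scheme the new comment describes (a scheduled msleep for the bulk of the delay, then a busy-wait for the precise tail; the in-tree implementation differs in details such as the clock source and thresholds):

/* sketch only; assumes <linux/ktime.h> and <linux/delay.h> */
static void sleep_until_sketch(ktime_t *waketime, u32 add_usec)
{
	s64 remaining;

	*waketime = ktime_add_us(*waketime, add_usec);
	remaining = ktime_us_delta(*waketime, ktime_get());

	if (remaining > 2500) {
		msleep((remaining - 1500) / 1000);	/* coarse, scheduled */
		remaining = ktime_us_delta(*waketime, ktime_get());
	}
	if (remaining > 0)
		udelay(remaining);			/* precise busy-wait */
}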
Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c index c38ef1a..d764cff 100644 --- a/drivers/media/dvb-core/dvb_frontend.c +++ b/drivers/media/dvb-core/dvb_frontend.c @@ -891,10 +891,11 @@ static void dvb_frontend_stop(struct dvb_frontend *fe) } /* - * Sleep until gettimeofday() > waketime + add_usec - * This needs to be as precise as possible, but as the delay is - * usually between 2ms and 32ms, it is done using a scheduled msleep - * followed by usleep (normally a busy-wait loop) for the remainder + * Sleep for the amount of time given by add_usec parameter + * + * This needs to be as precise as possible, as it affects the detection of + * the dish tone command at the satellite subsystem. The precision is improved + * by using a scheduled msleep followed by udelay for the remainder. */ void dvb_frontend_sleep_until(ktime_t *waketime, u32 add_usec) { diff --git a/drivers/media/dvb-core/dvb_frontend.h b/drivers/media/dvb-core/dvb_frontend.h index 97661b2..a6bc037 100644 --- a/drivers/media/dvb-core/dvb_frontend.h +++ b/drivers/media/dvb-core/dvb_frontend.h @@ -404,6 +404,11 @@ struct dtv_frontend_properties; * FE_ENABLE_HIGH_LNB_VOLTAGE ioctl (only Satellite). * @dishnetwork_send_legacy_command: callback function to implement the * FE_DISHNETWORK_SEND_LEGACY_CMD ioctl (only Satellite). + * Drivers should not use this, except when the DVB + * core emulation fails to provide proper support (e.g. + * if set_voltage() takes more than 8ms to work), and + * when backward compatibility with this legacy API is + * required. * @i2c_gate_ctrl: controls the I2C gate. Newer drivers should use I2C * mux support instead. * @ts_bus_ctrl: callback function used to take control of the TS bus. @@ -693,6 +698,29 @@ extern void dvb_frontend_reinitialise(struct dvb_frontend *fe); extern int dvb_frontend_suspend(struct dvb_frontend *fe); extern int dvb_frontend_resume(struct dvb_frontend *fe); -extern void dvb_frontend_sleep_until(ktime_t *waketime, u32 add_usec); +/** + * dvb_frontend_sleep_until() - Sleep for the amount of time given by + * add_usec parameter + * + * @waketime: pointer to a struct ktime_t + * @add_usec: time to sleep, in microseconds + * + * This function is used to measure the time required for the + * %FE_DISHNETWORK_SEND_LEGACY_CMD ioctl to work. It needs to be as precise + * as possible, as it affects the detection of the dish tone command at the + * satellite subsystem. + * + * It's used internally by the DVB frontend core, in order to emulate + * %FE_DISHNETWORK_SEND_LEGACY_CMD using the &dvb_frontend_ops.set_voltage() + * callback. + * + * NOTE: it should not be used by drivers, as the emulation for the + * legacy callback is provided by the Kernel. The only situation where this + * should be used by drivers is when there are bugs in the hardware that + * would prevent the core emulation from working. In such cases, the driver + * would be writing a &dvb_frontend_ops.dishnetwork_send_legacy_command() and + * calling this function directly. + */ +void dvb_frontend_sleep_until(ktime_t *waketime, u32 add_usec); #endif -- cgit v0.10.2 From 75f400b0e358b42c31162404c1e9e8511426f1e2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 10 Nov 2015 10:26:39 -0200 Subject: [media] Document the obscure dvb_frontend_reinitialise() The dvb_frontend_reinitialise() function is a special case used by just one frontend. Document it, for completeness. 
Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-core/dvb_frontend.h b/drivers/media/dvb-core/dvb_frontend.h index a6bc037..7d7041a 100644 --- a/drivers/media/dvb-core/dvb_frontend.h +++ b/drivers/media/dvb-core/dvb_frontend.h @@ -694,11 +694,24 @@ extern int dvb_unregister_frontend(struct dvb_frontend *fe); extern void dvb_frontend_detach(struct dvb_frontend *fe); -extern void dvb_frontend_reinitialise(struct dvb_frontend *fe); extern int dvb_frontend_suspend(struct dvb_frontend *fe); extern int dvb_frontend_resume(struct dvb_frontend *fe); /** + * dvb_frontend_reinitialise() - forces a reinitialisation at the frontend + * + * @fe: pointer to the frontend struct + * + * Calls &dvb_frontend_ops.init() and &dvb_frontend_ops.tuner_ops.init(), + * and resets SEC tone and voltage (for Satellite systems). + * + * NOTE: Currently, this function is used only by one driver (budget-av). + * It seems to exist to address some special issue with that specific + * frontend. + */ +void dvb_frontend_reinitialise(struct dvb_frontend *fe); + +/** * dvb_frontend_sleep_until() - Sleep for the amount of time given by * add_usec parameter * -- cgit v0.10.2 From 66f4b3cb5c2c68810772e7eee30b3d4b85852639 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 10 Nov 2015 10:46:25 -0200 Subject: [media] dvb_frontend: document the most used functions Documents the most used functions of the Digital TV kABI: dvb_frontend_register(), dvb_frontend_unregister() and dvb_frontend_detach(). Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-core/dvb_frontend.h b/drivers/media/dvb-core/dvb_frontend.h index 7d7041a..d15507b 100644 --- a/drivers/media/dvb-core/dvb_frontend.h +++ b/drivers/media/dvb-core/dvb_frontend.h @@ -687,12 +687,49 @@ struct dvb_frontend { unsigned int exit; }; -extern int dvb_register_frontend(struct dvb_adapter *dvb, +/** + * dvb_register_frontend() - Registers a DVB frontend at the adapter + * + * @dvb: pointer to the dvb adapter + * @fe: pointer to the frontend struct + * + * Allocates and initializes the private data needed by the frontend core to + * manage the frontend and calls dvb_register_device() to register a new + * frontend. It also cleans the property cache that stores the frontend + * parameters and selects the first available delivery system. + */ +int dvb_register_frontend(struct dvb_adapter *dvb, struct dvb_frontend *fe); -extern int dvb_unregister_frontend(struct dvb_frontend *fe); +/** + * dvb_unregister_frontend() - Unregisters a DVB frontend + * + * @fe: pointer to the frontend struct + * + * Stops the frontend kthread, calls dvb_unregister_device() and frees the + * private frontend data allocated by dvb_register_frontend(). + * + * NOTE: This function doesn't free the memory allocated by the demod, + * by the SEC driver and by the tuner. In order to free it, an explicit call to + * dvb_frontend_detach() is needed, after calling this function. + */ +int dvb_unregister_frontend(struct dvb_frontend *fe); -extern void dvb_frontend_detach(struct dvb_frontend *fe); +/** + * dvb_frontend_detach() - Detaches and frees frontend specific data + * + * @fe: pointer to the frontend struct + * + * This function should be called after dvb_unregister_frontend(). It + * calls the SEC, tuner and demod release functions: + * &dvb_frontend_ops.release_sec, &dvb_frontend_ops.tuner_ops.release, + * &dvb_frontend_ops.analog_ops.release and &dvb_frontend_ops.release. 
+ * + * If the driver is compiled with CONFIG_MEDIA_ATTACH, it also decreases + * the module reference count, needed to allow userspace to remove the + * previously used DVB frontend modules. + */ +void dvb_frontend_detach(struct dvb_frontend *fe); extern int dvb_frontend_suspend(struct dvb_frontend *fe); extern int dvb_frontend_resume(struct dvb_frontend *fe); -- cgit v0.10.2 From 02f028cfd962106a6d223549f7f898b532117ecf Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 10 Nov 2015 11:12:30 -0200 Subject: [media] dvb_frontend.h: Add a description for the header This header file provides the kABI functions used by the Digital TV Frontend core support. Add a description for this kABI, to add at the device_drivers Kernel DocBook. Signed-off-by: Mauro Carvalho Chehab diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index fc7242d..7b3fcc5 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -244,6 +244,7 @@ X!Isound/sound_firmware.c !Idrivers/media/dvb-core/dvbdev.h Digital TV Frontend kABI +!Pdrivers/media/dvb-core/dvb_frontend.h Digital TV Frontend !Idrivers/media/dvb-core/dvb_frontend.h Digital TV Demux kABI diff --git a/drivers/media/dvb-core/dvb_frontend.h b/drivers/media/dvb-core/dvb_frontend.h index d15507b..47e1ee4 100644 --- a/drivers/media/dvb-core/dvb_frontend.h +++ b/drivers/media/dvb-core/dvb_frontend.h @@ -42,6 +42,29 @@ #include "dvbdev.h" +/** + * DOC: Digital TV Frontend + * + * The Digital TV Frontend kABI defines a driver-internal interface for + * registering low-level, hardware specific driver to a hardware independent + * frontend layer. It is only of interest for Digital TV device driver writers. + * The header file for this API is named dvb_frontend.h and located in + * drivers/media/dvb-core. + * + * Before using the Digital TV frontend core, the bridge driver should attach + * the frontend demod, tuner and SEC devices and call dvb_register_frontend(), + * in order to register the new frontend at the subsystem. At device + * detach/removal, the bridge driver should call dvb_unregister_frontend() to + * remove the frontend from the core and then dvb_frontend_detach() to free the + * memory allocated by the frontend drivers. + * + * The drivers should also call dvb_frontend_suspend() as part of their + * handler for the &device_driver.suspend(), and dvb_frontend_resume() as + * part of their handler for &device_driver.resume(). + * + * A few other optional functions are provided to handle some special cases. + */ + /* * Maximum number of Delivery systems per frontend. It * should be smaller or equal to 32 -- cgit v0.10.2 From 03946b2d661fc889bee4cf204a2f9853ca27c986 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 10 Nov 2015 11:14:52 -0200 Subject: [media] demux.h: Some documentation fixups for the header The DocBook description of this header has two issues: - It calls the Kernel ABI as API, instead of kABI; - It mentions that the DVB frontend kABI is not described within the document. As this will actually generate a single DocBook, this is actually not true, now that the documentation for the frontend was added. So, fix both issues. 
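To make the attach/register/unregister/detach ordering described in the Digital TV Frontend DOC comment above concrete, a bridge driver would follow roughly this shape (a hypothetical sketch; the mydrv_*/mydemod_* names are invented for illustration):

static int mydrv_probe(struct mydrv_dev *dev)
{
	/* attach the demod (and tuner/SEC) first ... */
	dev->fe = mydemod_attach(&mydemod_config, dev->i2c_adap);
	if (!dev->fe)
		return -ENODEV;
	/* ... then hand the frontend over to the core */
	return dvb_register_frontend(&dev->dvb_adapter, dev->fe);
}

static void mydrv_remove(struct mydrv_dev *dev)
{
	/* reverse order on removal: unregister from the core, then free */
	dvb_unregister_frontend(dev->fe);
	dvb_frontend_detach(dev->fe);
}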
Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-core/demux.h b/drivers/media/dvb-core/demux.h index f716e14..6d3b95b 100644 --- a/drivers/media/dvb-core/demux.h +++ b/drivers/media/dvb-core/demux.h @@ -35,31 +35,31 @@ /** * DOC: Digital TV Demux * - * The kernel demux API defines a driver-internal interface for registering - * low-level, hardware specific driver to a hardware independent demux layer. - * It is only of interest for Digital TV device driver writers. - * The header file for this API is named demux.h and located in + * The Kernel Digital TV Demux kABI defines a driver-internal interface for + * registering low-level, hardware specific driver to a hardware independent + * demux layer. It is only of interest for Digital TV device driver writers. + * The header file for this kABI is named demux.h and located in * drivers/media/dvb-core. * - * The demux API should be implemented for each demux in the system. It is + * The demux kABI should be implemented for each demux in the system. It is * used to select the TS source of a demux and to manage the demux resources. - * When the demux client allocates a resource via the demux API, it receives - * a pointer to the API of that resource. + * When the demux client allocates a resource via the demux kABI, it receives + * a pointer to the kABI of that resource. * * Each demux receives its TS input from a DVB front-end or from memory, as - * set via this demux API. In a system with more than one front-end, the API + * set via this demux kABI. In a system with more than one front-end, the kABI * can be used to select one of the DVB front-ends as a TS source for a demux, * unless this is fixed in the HW platform. * - * The demux API only controls front-ends regarding to their connections with - * demuxes; the APIs used to set the other front-end parameters, such as - * tuning, are not defined in this document. + * The demux kABI only controls front-ends regarding to their connections with + * demuxes; the kABI used to set the other front-end parameters, such as + * tuning, is defined via the Digital TV Frontend kABI. * * The functions that implement the abstract interface demux should be defined * static or module private and registered to the Demux core for external * access. It is not necessary to implement every function in the struct * &dmx_demux. For example, a demux interface might support Section filtering, - * but not PES filtering. The API client is expected to check the value of any + * but not PES filtering. The kABI client is expected to check the value of any * function pointer before calling the function: the value of NULL means * that the function is not available. * @@ -71,7 +71,7 @@ * Even a simple memory allocation without using %GFP_ATOMIC can result in a * kernel thread being put to sleep if swapping is needed. For example, the * Linux Kernel calls the functions of a network device interface from a - * bottom half context. Thus, if a demux API function is called from network + * bottom half context. Thus, if a demux kABI function is called from network * device code, the function must not sleep. */ -- cgit v0.10.2 From 3663b31b69222be90ce8bf346e0d5ba3b5ecbac0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 10 Nov 2015 11:50:30 -0200 Subject: [media] dvb_frontend: resume tone and voltage As SEC tone and voltage could have changed during suspend(), restore them to their previous values at resume(). 
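The replay at resume() works because the core remembers the last commanded SEC state whenever userspace sets it; schematically, the caching side (which lives in the ioctl path, not in this diff) looks like this hypothetical sketch:

/* sketch: fepriv->tone starts at -1 ("never set"), so resume() can tell
 * whether there is anything to replay */
static int set_tone_cached(struct dvb_frontend *fe,
			   struct dvb_frontend_private *fepriv,
			   enum fe_sec_tone_mode tone)
{
	int ret = fe->ops.set_tone(fe, tone);

	if (ret == 0)
		fepriv->tone = tone;	/* remembered for replay at resume() */
	return ret;
}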
Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c index d764cff..0b52cfc 100644 --- a/drivers/media/dvb-core/dvb_frontend.c +++ b/drivers/media/dvb-core/dvb_frontend.c @@ -2711,6 +2711,11 @@ int dvb_frontend_resume(struct dvb_frontend *fe) else if (fe->ops.tuner_ops.init) ret = fe->ops.tuner_ops.init(fe); + if (fe->ops.set_tone && fepriv->tone != -1) + fe->ops.set_tone(fe, fepriv->tone); + if (fe->ops.set_voltage && fepriv->voltage != -1) + fe->ops.set_voltage(fe, fepriv->voltage); + fe->exit = DVB_FE_NO_EXIT; fepriv->state = FESTATE_RETUNE; dvb_frontend_wakeup(fe); -- cgit v0.10.2 From 41c0b78e5eaf29f866ebffda4e0e86c9be589477 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 10 Nov 2015 11:54:15 -0200 Subject: [media] dvb_frontend.h: Document suspend/resume functions Those functions should be implemented by all drivers. So, document them. Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-core/dvb_frontend.h b/drivers/media/dvb-core/dvb_frontend.h index 47e1ee4..bd79d55 100644 --- a/drivers/media/dvb-core/dvb_frontend.h +++ b/drivers/media/dvb-core/dvb_frontend.h @@ -754,8 +754,44 @@ int dvb_unregister_frontend(struct dvb_frontend *fe); */ void dvb_frontend_detach(struct dvb_frontend *fe); -extern int dvb_frontend_suspend(struct dvb_frontend *fe); -extern int dvb_frontend_resume(struct dvb_frontend *fe); +/** + * dvb_frontend_suspend() - Suspends a Digital TV frontend + * + * @fe: pointer to the frontend struct + * + * This function prepares a Digital TV frontend to suspend. + * + * In order to prepare the tuner to suspend, if + * &dvb_frontend_ops.tuner_ops.suspend() is available, it calls it. Otherwise, + * it will call &dvb_frontend_ops.tuner_ops.sleep(), if available. + * + * It will also call &dvb_frontend_ops.sleep() to put the demod to suspend. + * + * The drivers should also call dvb_frontend_suspend() as part of their + * handler for the &device_driver.suspend(). + */ +int dvb_frontend_suspend(struct dvb_frontend *fe); + +/** + * dvb_frontend_resume() - Resumes a Digital TV frontend + * + * @fe: pointer to the frontend struct + * + * This function resumes the usual operation of the tuner after resume. + * + * In order to resume the frontend, it calls the demod &dvb_frontend_ops.init(). + * + * If &dvb_frontend_ops.tuner_ops.resume() is available, it calls it. + * Otherwise, it will call &dvb_frontend_ops.tuner_ops.init(), if available. + * + * Once tuner and demods are resumed, it will enforce that the SEC voltage and + * tone are restored to their previous values and wake up the frontend's + * kthread in order to retune the frontend. + * + * The drivers should also call dvb_frontend_resume() as part of their + * handler for the &device_driver.resume(). 
+ */ +int dvb_frontend_resume(struct dvb_frontend *fe); /** * dvb_frontend_reinitialise() - forces a reinitialisation at the frontend -- cgit v0.10.2 From 2184e2530c061ce6d26b80c346afb1a2c63db952 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 15:52:25 -0200 Subject: [media] dvb_frontend.h: get rid of unused tuner params/states There are several tuner_param values that aren't used by any driver or core: DVBFE_TUNER_TUNERSTEP DVBFE_TUNER_IFFREQ DVBFE_TUNER_REFCLOCK DVBFE_TUNER_IQSENSE DVBFE_TUNER_DUMMY Several of those correspond to values in the tuner_state struct, which are also only initialized but never used anywhere: u32 tunerstep; u32 ifreq; u32 refclock; It doesn't make sense to keep anything in the kABI that is not used. So, get rid of them. Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-core/dvb_frontend.h b/drivers/media/dvb-core/dvb_frontend.h index bd79d55..4856411 100644 --- a/drivers/media/dvb-core/dvb_frontend.h +++ b/drivers/media/dvb-core/dvb_frontend.h @@ -137,12 +137,7 @@ struct analog_parameters { enum tuner_param { DVBFE_TUNER_FREQUENCY = (1 << 0), - DVBFE_TUNER_TUNERSTEP = (1 << 1), - DVBFE_TUNER_IFFREQ = (1 << 2), - DVBFE_TUNER_BANDWIDTH = (1 << 3), - DVBFE_TUNER_REFCLOCK = (1 << 4), - DVBFE_TUNER_IQSENSE = (1 << 5), - DVBFE_TUNER_DUMMY = (1 << 31) + DVBFE_TUNER_BANDWIDTH = (1 << 1), }; /** @@ -177,11 +172,7 @@ enum dvbfe_algo { struct tuner_state { u32 frequency; - u32 tunerstep; - u32 ifreq; u32 bandwidth; - u32 iqsense; - u32 refclock; }; /** diff --git a/drivers/media/dvb-frontends/stb6100.c b/drivers/media/dvb-frontends/stb6100.c index 4ef8a5c..e7f8d2c 100644 --- a/drivers/media/dvb-frontends/stb6100.c +++ b/drivers/media/dvb-frontends/stb6100.c @@ -496,14 +496,15 @@ static int stb6100_init(struct dvb_frontend *fe) { struct stb6100_state *state = fe->tuner_priv; struct tuner_state *status = &state->status; + int refclk = 27000000; /* Hz */ - status->tunerstep = 125000; - status->ifreq = 0; - status->refclock = 27000000; /* Hz */ - status->iqsense = 1; + /* + * iqsense = 1 + * tunerstep = 125000 + */ status->bandwidth = 36000; /* kHz */ state->bandwidth = status->bandwidth * 1000; /* Hz */ - state->reference = status->refclock / 1000; /* kHz */ + state->reference = refclk / 1000; /* kHz */ /* Set default bandwidth. Modified, PN 13-May-10 */ return 0; @@ -517,15 +518,9 @@ static int stb6100_get_state(struct dvb_frontend *fe, case DVBFE_TUNER_FREQUENCY: stb6100_get_frequency(fe, &state->frequency); break; - case DVBFE_TUNER_TUNERSTEP: - break; - case DVBFE_TUNER_IFFREQ: - break; case DVBFE_TUNER_BANDWIDTH: stb6100_get_bandwidth(fe, &state->bandwidth); break; - case DVBFE_TUNER_REFCLOCK: - break; default: break; } @@ -544,16 +539,10 @@ static int stb6100_set_state(struct dvb_frontend *fe, stb6100_set_frequency(fe, state->frequency); tstate->frequency = state->frequency; break; - case DVBFE_TUNER_TUNERSTEP: - break; - case DVBFE_TUNER_IFFREQ: - break; case DVBFE_TUNER_BANDWIDTH: stb6100_set_bandwidth(fe, state->bandwidth); tstate->bandwidth = state->bandwidth; break; - case DVBFE_TUNER_REFCLOCK: - break; default: break; } -- cgit v0.10.2 From cffdbfe7cf9c7201d4d6c3a0c6b224497595431a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 16:52:46 -0200 Subject: [media] stb6100: get rid of tuner_state at struct stb6100_state The stb6100 driver has a struct tuner_state on its state struct, which is used only to store the bandwidth. 
Even so, this struct is not really used, as every time the bandwidth is read or set, the driver goes through the hardware. So, get rid of it. Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-frontends/stb6100.c b/drivers/media/dvb-frontends/stb6100.c index e7f8d2c..5d8dbde 100644 --- a/drivers/media/dvb-frontends/stb6100.c +++ b/drivers/media/dvb-frontends/stb6100.c @@ -252,6 +252,7 @@ static int stb6100_get_bandwidth(struct dvb_frontend *fe, u32 *bandwidth) { int rc; u8 f; + u32 bw; struct stb6100_state *state = fe->tuner_priv; rc = stb6100_read_reg(state, STB6100_F); @@ -259,9 +260,9 @@ static int stb6100_get_bandwidth(struct dvb_frontend *fe, u32 *bandwidth) return rc; f = rc & STB6100_F_F; - state->status.bandwidth = (f + 5) * 2000; /* x2 for ZIF */ + bw = (f + 5) * 2000; /* x2 for ZIF */ - *bandwidth = state->bandwidth = state->status.bandwidth * 1000; + *bandwidth = state->bandwidth = bw * 1000; dprintk(verbose, FE_DEBUG, 1, "bandwidth = %u Hz", state->bandwidth); return 0; } @@ -495,15 +496,13 @@ static int stb6100_sleep(struct dvb_frontend *fe) static int stb6100_init(struct dvb_frontend *fe) { struct stb6100_state *state = fe->tuner_priv; - struct tuner_state *status = &state->status; int refclk = 27000000; /* Hz */ /* * iqsense = 1 * tunerstep = 125000 */ - status->bandwidth = 36000; /* kHz */ - state->bandwidth = status->bandwidth * 1000; /* Hz */ + state->bandwidth = 36000000; /* Hz */ state->reference = refclk / 1000; /* kHz */ /* Set default bandwidth. Modified, PN 13-May-10 */ diff --git a/drivers/media/dvb-frontends/stb6100.h b/drivers/media/dvb-frontends/stb6100.h index 218c818..f7b468b 100644 --- a/drivers/media/dvb-frontends/stb6100.h +++ b/drivers/media/dvb-frontends/stb6100.h @@ -86,7 +86,6 @@ struct stb6100_state { const struct stb6100_config *config; struct dvb_tuner_ops ops; struct dvb_frontend *frontend; - struct tuner_state status; u32 frequency; u32 srate; -- cgit v0.10.2 From 8e6c4be3f8f7889b0806d0635c7b85a6328d77ed Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 17:33:59 -0200 Subject: [media] tda665x: split set_frequency from set_state On tda665x, set_state only sets the frequency. As the kABI for set_state is meant to be used only in special cases, split the function into two, in order to allow it to be later used by the DVBv5 cache params logic.
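For illustration, the shape of the split, reduced to its essentials (a minimal sketch with hypothetical my_tuner_* names, not the actual tda665x symbols; the real diff follows below):

static int my_tuner_set_frequency(struct dvb_frontend *fe, u32 new_frequency)
{
	/* program the PLL for new_frequency here; return 0 or -errno */
	return 0;
}

/* The legacy entry point shrinks to a dispatcher on the param bitmask. */
static int my_tuner_set_state(struct dvb_frontend *fe,
			      enum tuner_param param,
			      struct tuner_state *tstate)
{
	if (param & DVBFE_TUNER_FREQUENCY)
		return my_tuner_set_frequency(fe, tstate->frequency);

	return -EINVAL;
}

With the frequency logic factored out, a later set_params() implementation can call the same helper with the value taken from the DVBv5 property cache.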
Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-frontends/tda665x.c b/drivers/media/dvb-frontends/tda665x.c index 63cc123..9c89253 100644 --- a/drivers/media/dvb-frontends/tda665x.c +++ b/drivers/media/dvb-frontends/tda665x.c @@ -111,9 +111,8 @@ exit: return err; } -static int tda665x_set_state(struct dvb_frontend *fe, - enum tuner_param param, - struct tuner_state *tstate) +static int tda665x_set_frequency(struct dvb_frontend *fe, + u32 new_frequency) { struct tda665x_state *state = fe->tuner_priv; const struct tda665x_config *config = state->config; @@ -121,88 +120,98 @@ static int tda665x_set_state(struct dvb_frontend *fe, u8 buf[4]; int err = 0; - if (param & DVBFE_TUNER_FREQUENCY) { - - frequency = tstate->frequency; - if ((frequency < config->frequency_max) || (frequency > config->frequency_min)) { - printk(KERN_ERR "%s: Frequency beyond limits, frequency=%d\n", __func__, frequency); - return -EINVAL; - } - - frequency += config->frequency_offst; - frequency *= config->ref_multiplier; - frequency += config->ref_divider >> 1; - frequency /= config->ref_divider; - - buf[0] = (u8) ((frequency & 0x7f00) >> 8); - buf[1] = (u8) (frequency & 0x00ff) >> 0; - buf[2] = 0x80 | 0x40 | 0x02; - buf[3] = 0x00; - - /* restore frequency */ - frequency = tstate->frequency; - - if (frequency < 153000000) { - /* VHF-L */ - buf[3] |= 0x01; /* fc, Low Band, 47 - 153 MHz */ - if (frequency < 68000000) - buf[3] |= 0x40; /* 83uA */ - if (frequency < 1040000000) - buf[3] |= 0x60; /* 122uA */ - if (frequency < 1250000000) - buf[3] |= 0x80; /* 163uA */ - else - buf[3] |= 0xa0; /* 254uA */ - } else if (frequency < 438000000) { - /* VHF-H */ - buf[3] |= 0x02; /* fc, Mid Band, 153 - 438 MHz */ - if (frequency < 230000000) - buf[3] |= 0x40; - if (frequency < 300000000) - buf[3] |= 0x60; - else - buf[3] |= 0x80; - } else { - /* UHF */ - buf[3] |= 0x04; /* fc, High Band, 438 - 862 MHz */ - if (frequency < 470000000) - buf[3] |= 0x60; - if (frequency < 526000000) - buf[3] |= 0x80; - else - buf[3] |= 0xa0; - } - - /* Set params */ - err = tda665x_write(state, buf, 5); - if (err < 0) - goto exit; - - /* sleep for some time */ - printk(KERN_DEBUG "%s: Waiting to Phase LOCK\n", __func__); - msleep(20); - /* check status */ - err = tda665x_get_status(fe, &status); - if (err < 0) - goto exit; - - if (status == 1) { - printk(KERN_DEBUG "%s: Tuner Phase locked: status=%d\n", __func__, status); - state->frequency = frequency; /* cache successful state */ - } else { - printk(KERN_ERR "%s: No Phase lock: status=%d\n", __func__, status); - } - } else { - printk(KERN_ERR "%s: Unknown parameter (param=%d)\n", __func__, param); + if ((new_frequency < config->frequency_max) + || (new_frequency > config->frequency_min)) { + printk(KERN_ERR "%s: Frequency beyond limits, frequency=%d\n", + __func__, new_frequency); return -EINVAL; } + frequency = new_frequency; + + frequency += config->frequency_offst; + frequency *= config->ref_multiplier; + frequency += config->ref_divider >> 1; + frequency /= config->ref_divider; + + buf[0] = (u8) ((frequency & 0x7f00) >> 8); + buf[1] = (u8) (frequency & 0x00ff) >> 0; + buf[2] = 0x80 | 0x40 | 0x02; + buf[3] = 0x00; + + /* restore frequency */ + frequency = new_frequency; + + if (frequency < 153000000) { + /* VHF-L */ + buf[3] |= 0x01; /* fc, Low Band, 47 - 153 MHz */ + if (frequency < 68000000) + buf[3] |= 0x40; /* 83uA */ + if (frequency < 1040000000) + buf[3] |= 0x60; /* 122uA */ + if (frequency < 1250000000) + buf[3] |= 0x80; /* 163uA */ + else + buf[3] |= 0xa0; /* 254uA */ + } 
else if (frequency < 438000000) { + /* VHF-H */ + buf[3] |= 0x02; /* fc, Mid Band, 153 - 438 MHz */ + if (frequency < 230000000) + buf[3] |= 0x40; + if (frequency < 300000000) + buf[3] |= 0x60; + else + buf[3] |= 0x80; + } else { + /* UHF */ + buf[3] |= 0x04; /* fc, High Band, 438 - 862 MHz */ + if (frequency < 470000000) + buf[3] |= 0x60; + if (frequency < 526000000) + buf[3] |= 0x80; + else + buf[3] |= 0xa0; + } + + /* Set params */ + err = tda665x_write(state, buf, 5); + if (err < 0) + goto exit; + + /* sleep for some time */ + printk(KERN_DEBUG "%s: Waiting to Phase LOCK\n", __func__); + msleep(20); + /* check status */ + err = tda665x_get_status(fe, &status); + if (err < 0) + goto exit; + + if (status == 1) { + printk(KERN_DEBUG "%s: Tuner Phase locked: status=%d\n", + __func__, status); + state->frequency = frequency; /* cache successful state */ + } else { + printk(KERN_ERR "%s: No Phase lock: status=%d\n", + __func__, status); + } + return 0; exit: printk(KERN_ERR "%s: I/O Error\n", __func__); return err; } +static int tda665x_set_state(struct dvb_frontend *fe, + enum tuner_param param, + struct tuner_state *tstate) +{ + if (param & DVBFE_TUNER_FREQUENCY) + return tda665x_set_frequency(fe, tstate->frequency); + + printk(KERN_ERR "%s: Unknown parameter (param=%d)\n", __func__, param); + return -EINVAL; +} + static int tda665x_release(struct dvb_frontend *fe) { struct tda665x_state *state = fe->tuner_priv; -- cgit v0.10.2 From 8fdc25bf61cf87280960e9fea453ac86b68fbb35 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 17:40:27 -0200 Subject: [media] tda665x: add support for set_params() and get_frequency() Those two callbacks are the ones that should be used by normal DVB frontend drivers. Add support for them. Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-frontends/tda665x.c b/drivers/media/dvb-frontends/tda665x.c index 9c89253..6ced688 100644 --- a/drivers/media/dvb-frontends/tda665x.c +++ b/drivers/media/dvb-frontends/tda665x.c @@ -88,6 +88,15 @@ static int tda665x_get_state(struct dvb_frontend *fe, return err; } +static int tda665x_get_frequency(struct dvb_frontend *fe, u32 *frequency) +{ + struct tda665x_state *state = fe->tuner_priv; + + *frequency = state->frequency; + + return 0; +} + static int tda665x_get_status(struct dvb_frontend *fe, u32 *status) { struct tda665x_state *state = fe->tuner_priv; @@ -201,6 +210,15 @@ exit: return err; } +static int tda665x_set_params(struct dvb_frontend *fe) +{ + struct dtv_frontend_properties *c = &fe->dtv_property_cache; + + tda665x_set_frequency(fe, c->frequency); + + return 0; +} + static int tda665x_set_state(struct dvb_frontend *fe, enum tuner_param param, struct tuner_state *tstate) @@ -226,6 +244,8 @@ static struct dvb_tuner_ops tda665x_ops = { .set_state = tda665x_set_state, .get_state = tda665x_get_state, .get_status = tda665x_get_status, + .set_params = tda665x_set_params, + .get_frequency = tda665x_get_frequency, .release = tda665x_release }; -- cgit v0.10.2 From e417668d402e07a0de40b996005324800e694633 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 18:07:44 -0200 Subject: [media] tda8261: don't use set_state/get_state callbacks Those callbacks are meant to be used only in some very specific cases. There's absolutely no need to do that in tda8261, as the only parameter that it allows to be set/get is the frequency. So, use the standard set_params() and get_frequency() kABI ops.
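The conversion pattern, reduced to a sketch (hypothetical my_tuner_* names, not the driver's own symbols): set_params() reads whatever it needs from the frontend's DVBv5 property cache instead of taking a tuner_state, and get_frequency() simply reports the value cached on the last successful tune.

static int my_tuner_set_params(struct dvb_frontend *fe)
{
	struct dtv_frontend_properties *c = &fe->dtv_property_cache;

	return my_tuner_set_frequency(fe, c->frequency);
}

static int my_tuner_get_frequency(struct dvb_frontend *fe, u32 *frequency)
{
	struct my_tuner_state *state = fe->tuner_priv;

	*frequency = state->frequency;	/* cached by the set path */
	return 0;
}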
There's no need to touch at any bridge driver, as all interactions are done via the macros at tda8261_cfg.h. Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-frontends/tda8261.c b/drivers/media/dvb-frontends/tda8261.c index 19c4888..3285b1b 100644 --- a/drivers/media/dvb-frontends/tda8261.c +++ b/drivers/media/dvb-frontends/tda8261.c @@ -83,88 +83,71 @@ static int tda8261_get_status(struct dvb_frontend *fe, u32 *status) static const u32 div_tab[] = { 2000, 1000, 500, 250, 125 }; /* kHz */ static const u8 ref_div[] = { 0x00, 0x01, 0x02, 0x05, 0x07 }; -static int tda8261_get_state(struct dvb_frontend *fe, - enum tuner_param param, - struct tuner_state *tstate) +static int tda8261_get_frequency(struct dvb_frontend *fe, u32 *frequency) { struct tda8261_state *state = fe->tuner_priv; - int err = 0; - switch (param) { - case DVBFE_TUNER_FREQUENCY: - tstate->frequency = state->frequency; - break; - case DVBFE_TUNER_BANDWIDTH: - tstate->bandwidth = 40000000; /* FIXME! need to calculate Bandwidth */ - break; - default: - pr_err("%s: Unknown parameter (param=%d)\n", __func__, param); - err = -EINVAL; - break; - } + *frequency = state->frequency; - return err; + return 0; } -static int tda8261_set_state(struct dvb_frontend *fe, - enum tuner_param param, - struct tuner_state *tstate) +static int tda8261_set_params(struct dvb_frontend *fe) { + struct dtv_frontend_properties *c = &fe->dtv_property_cache; struct tda8261_state *state = fe->tuner_priv; const struct tda8261_config *config = state->config; u32 frequency, N, status = 0; u8 buf[4]; int err = 0; - if (param & DVBFE_TUNER_FREQUENCY) { - /** - * N = Max VCO Frequency / Channel Spacing - * Max VCO Frequency = VCO frequency + (channel spacing - 1) - * (to account for half channel spacing on either side) - */ - frequency = tstate->frequency; - if ((frequency < 950000) || (frequency > 2150000)) { - pr_warn("%s: Frequency beyond limits, frequency=%d\n", __func__, frequency); - return -EINVAL; - } - N = (frequency + (div_tab[config->step_size] - 1)) / div_tab[config->step_size]; - pr_debug("%s: Step size=%d, Divider=%d, PG=0x%02x (%d)\n", - __func__, config->step_size, div_tab[config->step_size], N, N); - - buf[0] = (N >> 8) & 0xff; - buf[1] = N & 0xff; - buf[2] = (0x01 << 7) | ((ref_div[config->step_size] & 0x07) << 1); - - if (frequency < 1450000) - buf[3] = 0x00; - else if (frequency < 2000000) - buf[3] = 0x40; - else if (frequency < 2150000) - buf[3] = 0x80; - - /* Set params */ - if ((err = tda8261_write(state, buf)) < 0) { - pr_err("%s: I/O Error\n", __func__); - return err; - } - /* sleep for some time */ - pr_debug("%s: Waiting to Phase LOCK\n", __func__); - msleep(20); - /* check status */ - if ((err = tda8261_get_status(fe, &status)) < 0) { - pr_err("%s: I/O Error\n", __func__); - return err; - } - if (status == 1) { - pr_debug("%s: Tuner Phase locked: status=%d\n", __func__, status); - state->frequency = frequency; /* cache successful state */ - } else { - pr_debug("%s: No Phase lock: status=%d\n", __func__, status); - } - } else { - pr_err("%s: Unknown parameter (param=%d)\n", __func__, param); + /* + * N = Max VCO Frequency / Channel Spacing + * Max VCO Frequency = VCO frequency + (channel spacing - 1) + * (to account for half channel spacing on either side) + */ + frequency = c->frequency; + if ((frequency < 950000) || (frequency > 2150000)) { + pr_warn("%s: Frequency beyond limits, frequency=%d\n", + __func__, frequency); return -EINVAL; } + N = (frequency + (div_tab[config->step_size] - 1)) / 
div_tab[config->step_size]; + pr_debug("%s: Step size=%d, Divider=%d, PG=0x%02x (%d)\n", + __func__, config->step_size, div_tab[config->step_size], N, N); + + buf[0] = (N >> 8) & 0xff; + buf[1] = N & 0xff; + buf[2] = (0x01 << 7) | ((ref_div[config->step_size] & 0x07) << 1); + + if (frequency < 1450000) + buf[3] = 0x00; + else if (frequency < 2000000) + buf[3] = 0x40; + else if (frequency < 2150000) + buf[3] = 0x80; + + /* Set params */ + err = tda8261_write(state, buf); + if (err < 0) { + pr_err("%s: I/O Error\n", __func__); + return err; + } + /* sleep for some time */ + pr_debug("%s: Waiting to Phase LOCK\n", __func__); + msleep(20); + /* check status */ + if ((err = tda8261_get_status(fe, &status)) < 0) { + pr_err("%s: I/O Error\n", __func__); + return err; + } + if (status == 1) { + pr_debug("%s: Tuner Phase locked: status=%d\n", __func__, + status); + state->frequency = frequency; /* cache successful state */ + } else { + pr_debug("%s: No Phase lock: status=%d\n", __func__, status); + } return 0; } @@ -182,14 +165,13 @@ static struct dvb_tuner_ops tda8261_ops = { .info = { .name = "TDA8261", -// .tuner_name = NULL, .frequency_min = 950000, .frequency_max = 2150000, .frequency_step = 0 }, - .set_state = tda8261_set_state, - .get_state = tda8261_get_state, + .set_params = tda8261_set_params, + .get_frequency = tda8261_get_frequency, .get_status = tda8261_get_status, .release = tda8261_release }; @@ -210,10 +192,7 @@ struct dvb_frontend *tda8261_attach(struct dvb_frontend *fe, fe->ops.tuner_ops = tda8261_ops; fe->ops.tuner_ops.info.frequency_step = div_tab[config->step_size]; -// fe->ops.tuner_ops.tuner_name = &config->buf; -// printk("%s: Attaching %s TDA8261 8PSK/QPSK tuner\n", -// __func__, fe->ops.tuner_ops.tuner_name); pr_info("%s: Attaching TDA8261 8PSK/QPSK tuner\n", __func__); return fe; diff --git a/drivers/media/dvb-frontends/tda8261_cfg.h b/drivers/media/dvb-frontends/tda8261_cfg.h index 04a19e1..fe527ff 100644 --- a/drivers/media/dvb-frontends/tda8261_cfg.h +++ b/drivers/media/dvb-frontends/tda8261_cfg.h @@ -21,17 +21,15 @@ static int tda8261_get_frequency(struct dvb_frontend *fe, u32 *frequency) { struct dvb_frontend_ops *frontend_ops = &fe->ops; struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state t_state; int err = 0; - if (tuner_ops->get_state) { - err = tuner_ops->get_state(fe, DVBFE_TUNER_FREQUENCY, &t_state); + if (tuner_ops->get_frequency) { + err = tuner_ops->get_frequency(fe, frequency); if (err < 0) { - printk("%s: Invalid parameter\n", __func__); + pr_err("%s: Invalid parameter\n", __func__); return err; } - *frequency = t_state.frequency; - printk("%s: Frequency=%d\n", __func__, t_state.frequency); + pr_debug("%s: Frequency=%d\n", __func__, *frequency); } return 0; } @@ -40,37 +38,24 @@ static int tda8261_set_frequency(struct dvb_frontend *fe, u32 frequency) { struct dvb_frontend_ops *frontend_ops = &fe->ops; struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state t_state; + struct dtv_frontend_properties *c = &fe->dtv_property_cache; int err = 0; - t_state.frequency = frequency; - - if (tuner_ops->set_state) { - err = tuner_ops->set_state(fe, DVBFE_TUNER_FREQUENCY, &t_state); + if (tuner_ops->set_params) { + err = tuner_ops->set_params(fe); if (err < 0) { - printk("%s: Invalid parameter\n", __func__); + pr_err("%s: Invalid parameter\n", __func__); return err; } } - printk("%s: Frequency=%d\n", __func__, t_state.frequency); + pr_debug("%s: Frequency=%d\n", __func__, c->frequency); return 0; } static int 
tda8261_get_bandwidth(struct dvb_frontend *fe, u32 *bandwidth) { - struct dvb_frontend_ops *frontend_ops = &fe->ops; - struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state t_state; - int err = 0; + /* FIXME! need to calculate Bandwidth */ + *bandwidth = 40000000; - if (tuner_ops->get_state) { - err = tuner_ops->get_state(fe, DVBFE_TUNER_BANDWIDTH, &t_state); - if (err < 0) { - printk("%s: Invalid parameter\n", __func__); - return err; - } - *bandwidth = t_state.bandwidth; - printk("%s: Bandwidth=%d\n", __func__, t_state.bandwidth); - } return 0; } -- cgit v0.10.2 From b2d3afcfbdb7cd48759433ab9a0dd1bf20cc6aa8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 18:14:08 -0200 Subject: [media] tda665x: get rid of get_state()/set_state() Those ops aren't used by any driver, which is weird. I suspect that the mantis_vb3030 driver was not working properly... Anyway, now that the driver uses set_params, the DVB frontend core should do the right thing. Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-frontends/tda665x.c b/drivers/media/dvb-frontends/tda665x.c index 6ced688..82f8cc5 100644 --- a/drivers/media/dvb-frontends/tda665x.c +++ b/drivers/media/dvb-frontends/tda665x.c @@ -66,28 +66,6 @@ exit: return err; } -static int tda665x_get_state(struct dvb_frontend *fe, - enum tuner_param param, - struct tuner_state *tstate) -{ - struct tda665x_state *state = fe->tuner_priv; - int err = 0; - - switch (param) { - case DVBFE_TUNER_FREQUENCY: - tstate->frequency = state->frequency; - break; - case DVBFE_TUNER_BANDWIDTH: - break; - default: - printk(KERN_ERR "%s: Unknown parameter (param=%d)\n", __func__, param); - err = -EINVAL; - break; - } - - return err; -} - static int tda665x_get_frequency(struct dvb_frontend *fe, u32 *frequency) { struct tda665x_state *state = fe->tuner_priv; @@ -219,17 +197,6 @@ static int tda665x_set_params(struct dvb_frontend *fe) return 0; } -static int tda665x_set_state(struct dvb_frontend *fe, - enum tuner_param param, - struct tuner_state *tstate) -{ - if (param & DVBFE_TUNER_FREQUENCY) - return tda665x_set_frequency(fe, tstate->frequency); - - printk(KERN_ERR "%s: Unknown parameter (param=%d)\n", __func__, param); - return -EINVAL; -} - static int tda665x_release(struct dvb_frontend *fe) { struct tda665x_state *state = fe->tuner_priv; @@ -240,9 +207,6 @@ static int tda665x_release(struct dvb_frontend *fe) } static struct dvb_tuner_ops tda665x_ops = { - - .set_state = tda665x_set_state, - .get_state = tda665x_get_state, .get_status = tda665x_get_status, .set_params = tda665x_set_params, .get_frequency = tda665x_get_frequency, -- cgit v0.10.2 From 65f0f686de9fcbad7dadcedae94de77ab8b558b7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 18:42:00 -0200 Subject: [media] stb6100: get rid of get_state()/set_state() It is tricky to get rid of those ops here, as the stv0299 driver wants to set the frequency separately from setting the bandwidth. So, we use a small trick: we temporarily fill the cache with 0 for either frequency or bandwidth and add some logic to set_params to only change the property(ies) that aren't zero.
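A sketch of the zero-as-"don't touch" trick described above (illustrative my_tuner_* names, not the verbatim driver code):

/* Tuner side: skip any cached property that is zero. */
static int my_tuner_set_params(struct dvb_frontend *fe)
{
	struct dtv_frontend_properties *c = &fe->dtv_property_cache;

	if (c->frequency > 0)
		my_tuner_set_frequency(fe, c->frequency);

	if (c->bandwidth_hz > 0)
		my_tuner_set_bandwidth(fe, c->bandwidth_hz);

	return 0;
}

/* Caller side: temporarily zero the field that must not change. */
static int my_set_frequency_only(struct dvb_frontend *fe, u32 frequency)
{
	struct dtv_frontend_properties *c = &fe->dtv_property_cache;
	u32 bw = c->bandwidth_hz;
	int err;

	c->frequency = frequency;
	c->bandwidth_hz = 0;		/* don't adjust the bandwidth */
	err = fe->ops.tuner_ops.set_params(fe);
	c->bandwidth_hz = bw;		/* restore the cache */

	return err;
}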
Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-frontends/stb6100.c b/drivers/media/dvb-frontends/stb6100.c index 5d8dbde..c978c80 100644 --- a/drivers/media/dvb-frontends/stb6100.c +++ b/drivers/media/dvb-frontends/stb6100.c @@ -502,49 +502,22 @@ static int stb6100_init(struct dvb_frontend *fe) * iqsense = 1 * tunerstep = 125000 */ - state->bandwidth = 36000000; /* Hz */ + state->bandwidth = 36000000; /* Hz */ state->reference = refclk / 1000; /* kHz */ /* Set default bandwidth. Modified, PN 13-May-10 */ return 0; } -static int stb6100_get_state(struct dvb_frontend *fe, - enum tuner_param param, - struct tuner_state *state) +static int stb6100_set_params(struct dvb_frontend *fe) { - switch (param) { - case DVBFE_TUNER_FREQUENCY: - stb6100_get_frequency(fe, &state->frequency); - break; - case DVBFE_TUNER_BANDWIDTH: - stb6100_get_bandwidth(fe, &state->bandwidth); - break; - default: - break; - } + struct dtv_frontend_properties *c = &fe->dtv_property_cache; - return 0; -} + if (c->frequency > 0) + stb6100_set_frequency(fe, c->frequency); -static int stb6100_set_state(struct dvb_frontend *fe, - enum tuner_param param, - struct tuner_state *state) -{ - struct stb6100_state *tstate = fe->tuner_priv; - - switch (param) { - case DVBFE_TUNER_FREQUENCY: - stb6100_set_frequency(fe, state->frequency); - tstate->frequency = state->frequency; - break; - case DVBFE_TUNER_BANDWIDTH: - stb6100_set_bandwidth(fe, state->bandwidth); - tstate->bandwidth = state->bandwidth; - break; - default: - break; - } + if (c->bandwidth_hz > 0) + stb6100_set_bandwidth(fe, c->bandwidth_hz); return 0; } @@ -560,8 +533,9 @@ static struct dvb_tuner_ops stb6100_ops = { .init = stb6100_init, .sleep = stb6100_sleep, .get_status = stb6100_get_status, - .get_state = stb6100_get_state, - .set_state = stb6100_set_state, + .set_params = stb6100_set_params, + .get_frequency = stb6100_get_frequency, + .get_bandwidth = stb6100_get_bandwidth, .release = stb6100_release }; diff --git a/drivers/media/dvb-frontends/stb6100_cfg.h b/drivers/media/dvb-frontends/stb6100_cfg.h index 6edc153..2ef67aa 100644 --- a/drivers/media/dvb-frontends/stb6100_cfg.h +++ b/drivers/media/dvb-frontends/stb6100_cfg.h @@ -19,20 +19,21 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#include +#include "dvb_frontend.h" + static int stb6100_get_frequency(struct dvb_frontend *fe, u32 *frequency) { struct dvb_frontend_ops *frontend_ops = &fe->ops; struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state t_state; int err = 0; - if (tuner_ops->get_state) { - err = tuner_ops->get_state(fe, DVBFE_TUNER_FREQUENCY, &t_state); + if (tuner_ops->get_frequency) { + err = tuner_ops->get_frequency(fe, frequency); if (err < 0) { printk("%s: Invalid parameter\n", __func__); return err; } - *frequency = t_state.frequency; } return 0; } @@ -41,13 +42,16 @@ static int stb6100_set_frequency(struct dvb_frontend *fe, u32 frequency) { struct dvb_frontend_ops *frontend_ops = &fe->ops; struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state t_state; + struct dtv_frontend_properties *c = &fe->dtv_property_cache; + u32 bw = c->bandwidth_hz; int err = 0; - t_state.frequency = frequency; + c->frequency = frequency; + c->bandwidth_hz = 0; /* Don't adjust the bandwidth */ - if (tuner_ops->set_state) { - err = tuner_ops->set_state(fe, DVBFE_TUNER_FREQUENCY, &t_state); + if (tuner_ops->set_params) { + err = tuner_ops->set_params(fe); + c->bandwidth_hz = bw; if (err < 0) { printk("%s: Invalid parameter\n", __func__); return err; @@ -60,16 +64,14 @@ static int stb6100_get_bandwidth(struct dvb_frontend *fe, u32 *bandwidth) { struct dvb_frontend_ops *frontend_ops = &fe->ops; struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state t_state; int err = 0; - if (tuner_ops->get_state) { - err = tuner_ops->get_state(fe, DVBFE_TUNER_BANDWIDTH, &t_state); + if (tuner_ops->get_bandwidth) { + err = tuner_ops->get_bandwidth(fe, bandwidth); if (err < 0) { printk("%s: Invalid parameter\n", __func__); return err; } - *bandwidth = t_state.bandwidth; } return 0; } @@ -78,13 +80,16 @@ static int stb6100_set_bandwidth(struct dvb_frontend *fe, u32 bandwidth) { struct dvb_frontend_ops *frontend_ops = &fe->ops; struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state t_state; + struct dtv_frontend_properties *c = &fe->dtv_property_cache; + u32 freq = c->frequency; int err = 0; - t_state.bandwidth = bandwidth; + c->bandwidth_hz = bandwidth; + c->frequency = 0; /* Don't adjust the frequency */ - if (tuner_ops->set_state) { - err = tuner_ops->set_state(fe, DVBFE_TUNER_BANDWIDTH, &t_state); + if (tuner_ops->set_params) { + err = tuner_ops->set_params(fe); + c->frequency = freq; if (err < 0) { printk("%s: Invalid parameter\n", __func__); return err; diff --git a/drivers/media/dvb-frontends/stb6100_proc.h b/drivers/media/dvb-frontends/stb6100_proc.h index bd8a0ec..50ffa21 100644 --- a/drivers/media/dvb-frontends/stb6100_proc.h +++ b/drivers/media/dvb-frontends/stb6100_proc.h @@ -17,27 +17,27 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#include +#include "dvb_frontend.h" + static int stb6100_get_freq(struct dvb_frontend *fe, u32 *frequency) { struct dvb_frontend_ops *frontend_ops = &fe->ops; struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state state; int err = 0; - if (tuner_ops->get_state) { + if (tuner_ops->get_frequency) { if (frontend_ops->i2c_gate_ctrl) frontend_ops->i2c_gate_ctrl(fe, 1); - err = tuner_ops->get_state(fe, DVBFE_TUNER_FREQUENCY, &state); + err = tuner_ops->get_frequency(fe, frequency); if (err < 0) { - printk(KERN_ERR "%s: Invalid parameter\n", __func__); + printk("%s: Invalid parameter\n", __func__); return err; } if (frontend_ops->i2c_gate_ctrl) frontend_ops->i2c_gate_ctrl(fe, 0); - - *frequency = state.frequency; } return 0; @@ -47,18 +47,21 @@ static int stb6100_set_freq(struct dvb_frontend *fe, u32 frequency) { struct dvb_frontend_ops *frontend_ops = &fe->ops; struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state state; + struct dtv_frontend_properties *c = &fe->dtv_property_cache; + u32 bw = c->bandwidth_hz; int err = 0; - state.frequency = frequency; + c->frequency = frequency; + c->bandwidth_hz = 0; /* Don't adjust the bandwidth */ - if (tuner_ops->set_state) { + if (tuner_ops->set_params) { if (frontend_ops->i2c_gate_ctrl) frontend_ops->i2c_gate_ctrl(fe, 1); - err = tuner_ops->set_state(fe, DVBFE_TUNER_FREQUENCY, &state); + err = tuner_ops->set_params(fe); + c->bandwidth_hz = bw; if (err < 0) { - printk(KERN_ERR "%s: Invalid parameter\n", __func__); + printk("%s: Invalid parameter\n", __func__); return err; } @@ -74,14 +77,13 @@ static int stb6100_get_bandw(struct dvb_frontend *fe, u32 *bandwidth) { struct dvb_frontend_ops *frontend_ops = &fe->ops; struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state state; int err = 0; - if (tuner_ops->get_state) { + if (tuner_ops->get_bandwidth) { if (frontend_ops->i2c_gate_ctrl) frontend_ops->i2c_gate_ctrl(fe, 1); - err = tuner_ops->get_state(fe, DVBFE_TUNER_BANDWIDTH, &state); + err = tuner_ops->get_bandwidth(fe, bandwidth); if (err < 0) { printk(KERN_ERR "%s: Invalid parameter\n", __func__); return err; @@ -89,8 +91,6 @@ static int stb6100_get_bandw(struct dvb_frontend *fe, u32 *bandwidth) if (frontend_ops->i2c_gate_ctrl) frontend_ops->i2c_gate_ctrl(fe, 0); - - *bandwidth = state.bandwidth; } return 0; @@ -100,16 +100,19 @@ static int stb6100_set_bandw(struct dvb_frontend *fe, u32 bandwidth) { struct dvb_frontend_ops *frontend_ops = &fe->ops; struct dvb_tuner_ops *tuner_ops = &frontend_ops->tuner_ops; - struct tuner_state state; + struct dtv_frontend_properties *c = &fe->dtv_property_cache; + u32 freq = c->frequency; int err = 0; - state.bandwidth = bandwidth; + c->bandwidth_hz = bandwidth; + c->frequency = 0; /* Don't adjust the frequency */ - if (tuner_ops->set_state) { + if (tuner_ops->set_params) { if (frontend_ops->i2c_gate_ctrl) frontend_ops->i2c_gate_ctrl(fe, 1); - err = tuner_ops->set_state(fe, DVBFE_TUNER_BANDWIDTH, &state); + err = tuner_ops->set_params(fe); + c->frequency = freq; if (err < 0) { printk(KERN_ERR "%s: Invalid parameter\n", __func__); return err; -- cgit v0.10.2 From 45346e0e83ae1d0eff59d6d200bcb9338c04355f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 18:57:16 -0200 Subject: [media] dvb_frontend: get rid of set_state ops & related data The get_state()/set_state and the corresponding data types (struct tuner_state and enum tuner_param) are old DVB interfaces that came from the DVBv3 time. 
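For contrast, the two calling conventions side by side (a sketch; the frequency value is arbitrary):

/* DVBv3 era: the caller names the parameter and passes a tuner_state */
struct tuner_state t = { .frequency = 1278000 };
fe->ops.tuner_ops.set_state(fe, DVBFE_TUNER_FREQUENCY, &t);

/* DVBv5: the caller fills the shared property cache, then makes one call */
fe->dtv_property_cache.frequency = 1278000;
fe->ops.tuner_ops.set_params(fe);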
Nowadays, set_params() provides a better way to set the tuner and demod parameters. So, there is no need to keep that legacy stuff, as all drivers that were using it got converted. With this patch, all kABI elements at dvb_frontend.h are now documented. Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-core/dvb_frontend.h b/drivers/media/dvb-core/dvb_frontend.h index 4856411..032e125 100644 --- a/drivers/media/dvb-core/dvb_frontend.h +++ b/drivers/media/dvb-core/dvb_frontend.h @@ -135,11 +135,6 @@ struct analog_parameters { u64 std; }; -enum tuner_param { - DVBFE_TUNER_FREQUENCY = (1 << 0), - DVBFE_TUNER_BANDWIDTH = (1 << 1), -}; - /** * enum dvbfe_algo - defines the algorithm used to tune into a channel * @@ -170,11 +165,6 @@ enum dvbfe_algo { DVBFE_ALGO_RECOVERY = (1 << 31) }; -struct tuner_state { - u32 frequency; - u32 bandwidth; -}; - /** * enum dvbfe_search - search callback possible return status * @@ -245,12 +235,6 @@ enum dvbfe_search { * set_params is preferred. * @set_bandwidth: Set a new frequency. Please notice that using * set_params is preferred. - * @set_state: callback function used on some legacy drivers that - * don't implement set_params in order to set properties. - * Shouldn't be used on new drivers. - * @get_state: callback function used to get properties by some - * legacy drivers that don't implement set_params. - * Shouldn't be used on new drivers. * * NOTE: frequencies used on get_frequency and set_frequency are in Hz for * terrestrial/cable or kHz for satellite. @@ -290,13 +274,6 @@ struct dvb_tuner_ops { * tuners which require sophisticated tuning loops, controlling each parameter separately. */ int (*set_frequency)(struct dvb_frontend *fe, u32 frequency); int (*set_bandwidth)(struct dvb_frontend *fe, u32 bandwidth); - - /* - * These are provided separately from set_params in order to facilitate silicon - * tuners which require sophisticated tuning loops, controlling each parameter separately. - */ - int (*set_state)(struct dvb_frontend *fe, enum tuner_param param, struct tuner_state *state); - int (*get_state)(struct dvb_frontend *fe, enum tuner_param param, struct tuner_state *state); }; /** diff --git a/drivers/media/tuners/mt2063.c b/drivers/media/tuners/mt2063.c index 9e9c5eb..6457ac9 100644 --- a/drivers/media/tuners/mt2063.c +++ b/drivers/media/tuners/mt2063.c @@ -225,7 +225,6 @@ struct mt2063_state { const struct mt2063_config *config; struct dvb_tuner_ops ops; struct dvb_frontend *frontend; - struct tuner_state status; u32 frequency; u32 srate; -- cgit v0.10.2 From bef0e549d165b681bb505c9839ce3d3be33822ee Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 11 Nov 2015 19:14:44 -0200 Subject: [media] dvb_frontend.h: improve documentation for struct dvb_tuner_ops Improve the comments in the header, removing the kernel-doc tag from where it doesn't belong, grouping the legacy tuner functions, and improving the text. No functional changes. Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/dvb-core/dvb_frontend.h b/drivers/media/dvb-core/dvb_frontend.h index 032e125..458bcce 100644 --- a/drivers/media/dvb-core/dvb_frontend.h +++ b/drivers/media/dvb-core/dvb_frontend.h @@ -213,12 +213,12 @@ enum dvbfe_search { * are stored at @dvb_frontend.dtv_property_cache;. The * tuner demod can change the parameters to reflect the * changes needed for the channel to be tuned, and - * update statistics. + * update statistics. This is the recommended way to set + * the tuner parameters and should be used on newer + * drivers.
* @set_analog_params: callback function used to tune into an analog TV * channel on hybrid tuners. It passes @analog_parameters; * to the driver. - * @calc_regs: callback function used to pass register data settings - * for simple tuners. * @set_config: callback function used to send some tuner-specific * parameters. * @get_frequency: get the actual tuned frequency @@ -231,10 +231,10 @@ enum dvbfe_search { * via DVBv5 API (@dvb_frontend.dtv_property_cache;). * @get_afc: Used only by analog TV core. Reports the frequency * drift due to AFC. - * @set_frequency: Set a new frequency. Please notice that using - * set_params is preferred. - * @set_bandwidth: Set a new frequency. Please notice that using - * set_params is preferred. + * @calc_regs: callback function used to pass register data settings + * for simple tuners. Shouldn't be used on newer drivers. + * @set_frequency: Set a new frequency. Shouldn't be used on newer drivers. + * @set_bandwidth: Set a new frequency. Shouldn't be used on newer drivers. * * NOTE: frequencies used on get_frequency and set_frequency are in Hz for * terrestrial/cable or kHz for satellite. @@ -250,14 +250,10 @@ struct dvb_tuner_ops { int (*suspend)(struct dvb_frontend *fe); int (*resume)(struct dvb_frontend *fe); - /** This is for simple PLLs - set all parameters in one go. */ + /* This is the recomended way to set the tuner */ int (*set_params)(struct dvb_frontend *fe); int (*set_analog_params)(struct dvb_frontend *fe, struct analog_parameters *p); - /** This is support for demods like the mt352 - fills out the supplied buffer with what to write. */ - int (*calc_regs)(struct dvb_frontend *fe, u8 *buf, int buf_len); - - /** This is to allow setting tuner-specific configs */ int (*set_config)(struct dvb_frontend *fe, void *priv_cfg); int (*get_frequency)(struct dvb_frontend *fe, u32 *frequency); @@ -270,8 +266,21 @@ struct dvb_tuner_ops { int (*get_rf_strength)(struct dvb_frontend *fe, u16 *strength); int (*get_afc)(struct dvb_frontend *fe, s32 *afc); - /** These are provided separately from set_params in order to facilitate silicon - * tuners which require sophisticated tuning loops, controlling each parameter separately. */ + /* + * This is support for demods like the mt352 - fills out the supplied + * buffer with what to write. + * + * Don't use on newer drivers. + */ + int (*calc_regs)(struct dvb_frontend *fe, u8 *buf, int buf_len); + + /* + * These are provided separately from set_params in order to + * facilitate silicon tuners which require sophisticated tuning loops, + * controlling each parameter separately. + * + * Don't use on newer drivers. 
+ */ int (*set_frequency)(struct dvb_frontend *fe, u32 frequency); int (*set_bandwidth)(struct dvb_frontend *fe, u32 bandwidth); }; @@ -462,7 +471,8 @@ struct dvb_frontend_ops { int (*ts_bus_ctrl)(struct dvb_frontend* fe, int acquire); int (*set_lna)(struct dvb_frontend *); - /* These callbacks are for devices that implement their own + /* + * These callbacks are for devices that implement their own * tuning algorithms, rather than a simple swzigzag */ enum dvbfe_search (*search)(struct dvb_frontend *fe); -- cgit v0.10.2 From f82a9ece922fd299e03ef5986e02b4e6bce8405a Mon Sep 17 00:00:00 2001 From: Graham Whaley Date: Thu, 12 Nov 2015 18:51:51 -0200 Subject: [media] DocBook/media/Makefile: Do not fail mkdir if dir already exists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 5240f4e68d42 ("[media] DocBook/media/Makefile: Avoid make htmldocs to fail") introduced a mkdir which is always called through install_media_images from the Documentation/DocBook/Makefile htmldocs rule. If you run 'make htmldocs' more than once you get: mkdir: cannot create directory ‘./Documentation/DocBook//media_api’: File exists Add -p to the mkdir to continue no matter if the dir already exists. Signed-off-by: Graham Whaley Signed-off-by: Mauro Carvalho Chehab diff --git a/Documentation/DocBook/media/Makefile b/Documentation/DocBook/media/Makefile index 08527e7..0284814 100644 --- a/Documentation/DocBook/media/Makefile +++ b/Documentation/DocBook/media/Makefile @@ -199,7 +199,7 @@ DVB_DOCUMENTED = \ # install_media_images = \ - $(Q)-mkdir $(MEDIA_OBJ_DIR)/media_api; \ + $(Q)-mkdir -p $(MEDIA_OBJ_DIR)/media_api; \ cp $(OBJIMGFILES) $(MEDIA_SRC_DIR)/*.svg $(MEDIA_SRC_DIR)/v4l/*.svg $(MEDIA_OBJ_DIR)/media_api $(MEDIA_OBJ_DIR)/%: $(MEDIA_SRC_DIR)/%.b64 -- cgit v0.10.2 From b7be755733dc44c72956c91876e5d86c56052a54 Mon Sep 17 00:00:00 2001 From: Alec Leamas Date: Thu, 12 Nov 2015 16:03:00 -0200 Subject: [media] bz#75751: Move internal header file lirc.h to uapi/ The file include/media/lirc.h describes a public interface and should thus be a public header. See kernel bug https://bugzilla.kernel.org/show_bug.cgi?id=75751 which has a manpage describing the interface + an acknowledgment that this info belongs to uapi. 
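Once the header lives under uapi, userspace can include it directly and use the very macros shown in the diff below. A minimal, hypothetical consumer (device path and error handling reduced to the essentials):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/lirc.h>

int main(void)
{
	unsigned int features;
	int fd = open("/dev/lirc0", O_RDONLY);

	if (fd < 0)
		return 1;
	/* LIRC_GET_FEATURES fills a __u32 feature bitmask */
	if (ioctl(fd, LIRC_GET_FEATURES, &features) == 0)
		printf("raw TX supported: %s\n",
		       (features & LIRC_CAN_SEND_RAW) ? "yes" : "no");
	close(fd);
	return 0;
}

In the kernel tree, the old include/media/lirc.h is reduced to a one-line forwarder that includes the uapi copy, so in-tree users keep compiling unchanged.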
Signed-off-by: Mauro Carvalho Chehab diff --git a/include/media/lirc.h b/include/media/lirc.h index 4b3ab29..554988c 100644 --- a/include/media/lirc.h +++ b/include/media/lirc.h @@ -1,168 +1 @@ -/* - * lirc.h - linux infrared remote control header file - * last modified 2010/07/13 by Jarod Wilson - */ - -#ifndef _LINUX_LIRC_H -#define _LINUX_LIRC_H - -#include -#include - -#define PULSE_BIT 0x01000000 -#define PULSE_MASK 0x00FFFFFF - -#define LIRC_MODE2_SPACE 0x00000000 -#define LIRC_MODE2_PULSE 0x01000000 -#define LIRC_MODE2_FREQUENCY 0x02000000 -#define LIRC_MODE2_TIMEOUT 0x03000000 - -#define LIRC_VALUE_MASK 0x00FFFFFF -#define LIRC_MODE2_MASK 0xFF000000 - -#define LIRC_SPACE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_SPACE) -#define LIRC_PULSE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_PULSE) -#define LIRC_FREQUENCY(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_FREQUENCY) -#define LIRC_TIMEOUT(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_TIMEOUT) - -#define LIRC_VALUE(val) ((val)&LIRC_VALUE_MASK) -#define LIRC_MODE2(val) ((val)&LIRC_MODE2_MASK) - -#define LIRC_IS_SPACE(val) (LIRC_MODE2(val) == LIRC_MODE2_SPACE) -#define LIRC_IS_PULSE(val) (LIRC_MODE2(val) == LIRC_MODE2_PULSE) -#define LIRC_IS_FREQUENCY(val) (LIRC_MODE2(val) == LIRC_MODE2_FREQUENCY) -#define LIRC_IS_TIMEOUT(val) (LIRC_MODE2(val) == LIRC_MODE2_TIMEOUT) - -/* used heavily by lirc userspace */ -#define lirc_t int - -/*** lirc compatible hardware features ***/ - -#define LIRC_MODE2SEND(x) (x) -#define LIRC_SEND2MODE(x) (x) -#define LIRC_MODE2REC(x) ((x) << 16) -#define LIRC_REC2MODE(x) ((x) >> 16) - -#define LIRC_MODE_RAW 0x00000001 -#define LIRC_MODE_PULSE 0x00000002 -#define LIRC_MODE_MODE2 0x00000004 -#define LIRC_MODE_LIRCCODE 0x00000010 - - -#define LIRC_CAN_SEND_RAW LIRC_MODE2SEND(LIRC_MODE_RAW) -#define LIRC_CAN_SEND_PULSE LIRC_MODE2SEND(LIRC_MODE_PULSE) -#define LIRC_CAN_SEND_MODE2 LIRC_MODE2SEND(LIRC_MODE_MODE2) -#define LIRC_CAN_SEND_LIRCCODE LIRC_MODE2SEND(LIRC_MODE_LIRCCODE) - -#define LIRC_CAN_SEND_MASK 0x0000003f - -#define LIRC_CAN_SET_SEND_CARRIER 0x00000100 -#define LIRC_CAN_SET_SEND_DUTY_CYCLE 0x00000200 -#define LIRC_CAN_SET_TRANSMITTER_MASK 0x00000400 - -#define LIRC_CAN_REC_RAW LIRC_MODE2REC(LIRC_MODE_RAW) -#define LIRC_CAN_REC_PULSE LIRC_MODE2REC(LIRC_MODE_PULSE) -#define LIRC_CAN_REC_MODE2 LIRC_MODE2REC(LIRC_MODE_MODE2) -#define LIRC_CAN_REC_LIRCCODE LIRC_MODE2REC(LIRC_MODE_LIRCCODE) - -#define LIRC_CAN_REC_MASK LIRC_MODE2REC(LIRC_CAN_SEND_MASK) - -#define LIRC_CAN_SET_REC_CARRIER (LIRC_CAN_SET_SEND_CARRIER << 16) -#define LIRC_CAN_SET_REC_DUTY_CYCLE (LIRC_CAN_SET_SEND_DUTY_CYCLE << 16) - -#define LIRC_CAN_SET_REC_DUTY_CYCLE_RANGE 0x40000000 -#define LIRC_CAN_SET_REC_CARRIER_RANGE 0x80000000 -#define LIRC_CAN_GET_REC_RESOLUTION 0x20000000 -#define LIRC_CAN_SET_REC_TIMEOUT 0x10000000 -#define LIRC_CAN_SET_REC_FILTER 0x08000000 - -#define LIRC_CAN_MEASURE_CARRIER 0x02000000 -#define LIRC_CAN_USE_WIDEBAND_RECEIVER 0x04000000 - -#define LIRC_CAN_SEND(x) ((x)&LIRC_CAN_SEND_MASK) -#define LIRC_CAN_REC(x) ((x)&LIRC_CAN_REC_MASK) - -#define LIRC_CAN_NOTIFY_DECODE 0x01000000 - -/*** IOCTL commands for lirc driver ***/ - -#define LIRC_GET_FEATURES _IOR('i', 0x00000000, __u32) - -#define LIRC_GET_SEND_MODE _IOR('i', 0x00000001, __u32) -#define LIRC_GET_REC_MODE _IOR('i', 0x00000002, __u32) -#define LIRC_GET_SEND_CARRIER _IOR('i', 0x00000003, __u32) -#define LIRC_GET_REC_CARRIER _IOR('i', 0x00000004, __u32) -#define LIRC_GET_SEND_DUTY_CYCLE _IOR('i', 0x00000005, __u32) -#define LIRC_GET_REC_DUTY_CYCLE _IOR('i', 
0x00000006, __u32) -#define LIRC_GET_REC_RESOLUTION _IOR('i', 0x00000007, __u32) - -#define LIRC_GET_MIN_TIMEOUT _IOR('i', 0x00000008, __u32) -#define LIRC_GET_MAX_TIMEOUT _IOR('i', 0x00000009, __u32) - -#define LIRC_GET_MIN_FILTER_PULSE _IOR('i', 0x0000000a, __u32) -#define LIRC_GET_MAX_FILTER_PULSE _IOR('i', 0x0000000b, __u32) -#define LIRC_GET_MIN_FILTER_SPACE _IOR('i', 0x0000000c, __u32) -#define LIRC_GET_MAX_FILTER_SPACE _IOR('i', 0x0000000d, __u32) - -/* code length in bits, currently only for LIRC_MODE_LIRCCODE */ -#define LIRC_GET_LENGTH _IOR('i', 0x0000000f, __u32) - -#define LIRC_SET_SEND_MODE _IOW('i', 0x00000011, __u32) -#define LIRC_SET_REC_MODE _IOW('i', 0x00000012, __u32) -/* Note: these can reset the according pulse_width */ -#define LIRC_SET_SEND_CARRIER _IOW('i', 0x00000013, __u32) -#define LIRC_SET_REC_CARRIER _IOW('i', 0x00000014, __u32) -#define LIRC_SET_SEND_DUTY_CYCLE _IOW('i', 0x00000015, __u32) -#define LIRC_SET_REC_DUTY_CYCLE _IOW('i', 0x00000016, __u32) -#define LIRC_SET_TRANSMITTER_MASK _IOW('i', 0x00000017, __u32) - -/* - * when a timeout != 0 is set the driver will send a - * LIRC_MODE2_TIMEOUT data packet, otherwise LIRC_MODE2_TIMEOUT is - * never sent, timeout is disabled by default - */ -#define LIRC_SET_REC_TIMEOUT _IOW('i', 0x00000018, __u32) - -/* 1 enables, 0 disables timeout reports in MODE2 */ -#define LIRC_SET_REC_TIMEOUT_REPORTS _IOW('i', 0x00000019, __u32) - -/* - * pulses shorter than this are filtered out by hardware (software - * emulation in lirc_dev?) - */ -#define LIRC_SET_REC_FILTER_PULSE _IOW('i', 0x0000001a, __u32) -/* - * spaces shorter than this are filtered out by hardware (software - * emulation in lirc_dev?) - */ -#define LIRC_SET_REC_FILTER_SPACE _IOW('i', 0x0000001b, __u32) -/* - * if filter cannot be set independently for pulse/space, this should - * be used - */ -#define LIRC_SET_REC_FILTER _IOW('i', 0x0000001c, __u32) - -/* - * if enabled from the next key press on the driver will send - * LIRC_MODE2_FREQUENCY packets - */ -#define LIRC_SET_MEASURE_CARRIER_MODE _IOW('i', 0x0000001d, __u32) - -/* - * to set a range use - * LIRC_SET_REC_DUTY_CYCLE_RANGE/LIRC_SET_REC_CARRIER_RANGE with the - * lower bound first and later - * LIRC_SET_REC_DUTY_CYCLE/LIRC_SET_REC_CARRIER with the upper bound - */ - -#define LIRC_SET_REC_DUTY_CYCLE_RANGE _IOW('i', 0x0000001e, __u32) -#define LIRC_SET_REC_CARRIER_RANGE _IOW('i', 0x0000001f, __u32) - -#define LIRC_NOTIFY_DECODE _IO('i', 0x00000020) - -#define LIRC_SETUP_START _IO('i', 0x00000021) -#define LIRC_SETUP_END _IO('i', 0x00000022) - -#define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) - -#endif +#include diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h new file mode 100644 index 0000000..4b3ab29 --- /dev/null +++ b/include/uapi/linux/lirc.h @@ -0,0 +1,168 @@ +/* + * lirc.h - linux infrared remote control header file + * last modified 2010/07/13 by Jarod Wilson + */ + +#ifndef _LINUX_LIRC_H +#define _LINUX_LIRC_H + +#include +#include + +#define PULSE_BIT 0x01000000 +#define PULSE_MASK 0x00FFFFFF + +#define LIRC_MODE2_SPACE 0x00000000 +#define LIRC_MODE2_PULSE 0x01000000 +#define LIRC_MODE2_FREQUENCY 0x02000000 +#define LIRC_MODE2_TIMEOUT 0x03000000 + +#define LIRC_VALUE_MASK 0x00FFFFFF +#define LIRC_MODE2_MASK 0xFF000000 + +#define LIRC_SPACE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_SPACE) +#define LIRC_PULSE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_PULSE) +#define LIRC_FREQUENCY(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_FREQUENCY) +#define LIRC_TIMEOUT(val) 
(((val)&LIRC_VALUE_MASK) | LIRC_MODE2_TIMEOUT) + +#define LIRC_VALUE(val) ((val)&LIRC_VALUE_MASK) +#define LIRC_MODE2(val) ((val)&LIRC_MODE2_MASK) + +#define LIRC_IS_SPACE(val) (LIRC_MODE2(val) == LIRC_MODE2_SPACE) +#define LIRC_IS_PULSE(val) (LIRC_MODE2(val) == LIRC_MODE2_PULSE) +#define LIRC_IS_FREQUENCY(val) (LIRC_MODE2(val) == LIRC_MODE2_FREQUENCY) +#define LIRC_IS_TIMEOUT(val) (LIRC_MODE2(val) == LIRC_MODE2_TIMEOUT) + +/* used heavily by lirc userspace */ +#define lirc_t int + +/*** lirc compatible hardware features ***/ + +#define LIRC_MODE2SEND(x) (x) +#define LIRC_SEND2MODE(x) (x) +#define LIRC_MODE2REC(x) ((x) << 16) +#define LIRC_REC2MODE(x) ((x) >> 16) + +#define LIRC_MODE_RAW 0x00000001 +#define LIRC_MODE_PULSE 0x00000002 +#define LIRC_MODE_MODE2 0x00000004 +#define LIRC_MODE_LIRCCODE 0x00000010 + + +#define LIRC_CAN_SEND_RAW LIRC_MODE2SEND(LIRC_MODE_RAW) +#define LIRC_CAN_SEND_PULSE LIRC_MODE2SEND(LIRC_MODE_PULSE) +#define LIRC_CAN_SEND_MODE2 LIRC_MODE2SEND(LIRC_MODE_MODE2) +#define LIRC_CAN_SEND_LIRCCODE LIRC_MODE2SEND(LIRC_MODE_LIRCCODE) + +#define LIRC_CAN_SEND_MASK 0x0000003f + +#define LIRC_CAN_SET_SEND_CARRIER 0x00000100 +#define LIRC_CAN_SET_SEND_DUTY_CYCLE 0x00000200 +#define LIRC_CAN_SET_TRANSMITTER_MASK 0x00000400 + +#define LIRC_CAN_REC_RAW LIRC_MODE2REC(LIRC_MODE_RAW) +#define LIRC_CAN_REC_PULSE LIRC_MODE2REC(LIRC_MODE_PULSE) +#define LIRC_CAN_REC_MODE2 LIRC_MODE2REC(LIRC_MODE_MODE2) +#define LIRC_CAN_REC_LIRCCODE LIRC_MODE2REC(LIRC_MODE_LIRCCODE) + +#define LIRC_CAN_REC_MASK LIRC_MODE2REC(LIRC_CAN_SEND_MASK) + +#define LIRC_CAN_SET_REC_CARRIER (LIRC_CAN_SET_SEND_CARRIER << 16) +#define LIRC_CAN_SET_REC_DUTY_CYCLE (LIRC_CAN_SET_SEND_DUTY_CYCLE << 16) + +#define LIRC_CAN_SET_REC_DUTY_CYCLE_RANGE 0x40000000 +#define LIRC_CAN_SET_REC_CARRIER_RANGE 0x80000000 +#define LIRC_CAN_GET_REC_RESOLUTION 0x20000000 +#define LIRC_CAN_SET_REC_TIMEOUT 0x10000000 +#define LIRC_CAN_SET_REC_FILTER 0x08000000 + +#define LIRC_CAN_MEASURE_CARRIER 0x02000000 +#define LIRC_CAN_USE_WIDEBAND_RECEIVER 0x04000000 + +#define LIRC_CAN_SEND(x) ((x)&LIRC_CAN_SEND_MASK) +#define LIRC_CAN_REC(x) ((x)&LIRC_CAN_REC_MASK) + +#define LIRC_CAN_NOTIFY_DECODE 0x01000000 + +/*** IOCTL commands for lirc driver ***/ + +#define LIRC_GET_FEATURES _IOR('i', 0x00000000, __u32) + +#define LIRC_GET_SEND_MODE _IOR('i', 0x00000001, __u32) +#define LIRC_GET_REC_MODE _IOR('i', 0x00000002, __u32) +#define LIRC_GET_SEND_CARRIER _IOR('i', 0x00000003, __u32) +#define LIRC_GET_REC_CARRIER _IOR('i', 0x00000004, __u32) +#define LIRC_GET_SEND_DUTY_CYCLE _IOR('i', 0x00000005, __u32) +#define LIRC_GET_REC_DUTY_CYCLE _IOR('i', 0x00000006, __u32) +#define LIRC_GET_REC_RESOLUTION _IOR('i', 0x00000007, __u32) + +#define LIRC_GET_MIN_TIMEOUT _IOR('i', 0x00000008, __u32) +#define LIRC_GET_MAX_TIMEOUT _IOR('i', 0x00000009, __u32) + +#define LIRC_GET_MIN_FILTER_PULSE _IOR('i', 0x0000000a, __u32) +#define LIRC_GET_MAX_FILTER_PULSE _IOR('i', 0x0000000b, __u32) +#define LIRC_GET_MIN_FILTER_SPACE _IOR('i', 0x0000000c, __u32) +#define LIRC_GET_MAX_FILTER_SPACE _IOR('i', 0x0000000d, __u32) + +/* code length in bits, currently only for LIRC_MODE_LIRCCODE */ +#define LIRC_GET_LENGTH _IOR('i', 0x0000000f, __u32) + +#define LIRC_SET_SEND_MODE _IOW('i', 0x00000011, __u32) +#define LIRC_SET_REC_MODE _IOW('i', 0x00000012, __u32) +/* Note: these can reset the according pulse_width */ +#define LIRC_SET_SEND_CARRIER _IOW('i', 0x00000013, __u32) +#define LIRC_SET_REC_CARRIER _IOW('i', 0x00000014, __u32) +#define LIRC_SET_SEND_DUTY_CYCLE _IOW('i', 
0x00000015, __u32) +#define LIRC_SET_REC_DUTY_CYCLE _IOW('i', 0x00000016, __u32) +#define LIRC_SET_TRANSMITTER_MASK _IOW('i', 0x00000017, __u32) + +/* + * when a timeout != 0 is set the driver will send a + * LIRC_MODE2_TIMEOUT data packet, otherwise LIRC_MODE2_TIMEOUT is + * never sent, timeout is disabled by default + */ +#define LIRC_SET_REC_TIMEOUT _IOW('i', 0x00000018, __u32) + +/* 1 enables, 0 disables timeout reports in MODE2 */ +#define LIRC_SET_REC_TIMEOUT_REPORTS _IOW('i', 0x00000019, __u32) + +/* + * pulses shorter than this are filtered out by hardware (software + * emulation in lirc_dev?) + */ +#define LIRC_SET_REC_FILTER_PULSE _IOW('i', 0x0000001a, __u32) +/* + * spaces shorter than this are filtered out by hardware (software + * emulation in lirc_dev?) + */ +#define LIRC_SET_REC_FILTER_SPACE _IOW('i', 0x0000001b, __u32) +/* + * if filter cannot be set independently for pulse/space, this should + * be used + */ +#define LIRC_SET_REC_FILTER _IOW('i', 0x0000001c, __u32) + +/* + * if enabled from the next key press on the driver will send + * LIRC_MODE2_FREQUENCY packets + */ +#define LIRC_SET_MEASURE_CARRIER_MODE _IOW('i', 0x0000001d, __u32) + +/* + * to set a range use + * LIRC_SET_REC_DUTY_CYCLE_RANGE/LIRC_SET_REC_CARRIER_RANGE with the + * lower bound first and later + * LIRC_SET_REC_DUTY_CYCLE/LIRC_SET_REC_CARRIER with the upper bound + */ + +#define LIRC_SET_REC_DUTY_CYCLE_RANGE _IOW('i', 0x0000001e, __u32) +#define LIRC_SET_REC_CARRIER_RANGE _IOW('i', 0x0000001f, __u32) + +#define LIRC_NOTIFY_DECODE _IO('i', 0x00000020) + +#define LIRC_SETUP_START _IO('i', 0x00000021) +#define LIRC_SETUP_END _IO('i', 0x00000022) + +#define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) + +#endif -- cgit v0.10.2 From b5dcee225ce972fecb054e104be22b2a6f65303d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 10 Nov 2015 12:01:44 -0200 Subject: [media] include/media: split I2C headers from V4L2 core Currently, include/media is messy, as it contains both the V4L2 core headers and some driver-specific headers on the same place. That makes harder to identify what core headers should be documented and what headers belong to I2C drivers that are included only by bridge/main drivers that would require the functions provided by them. Let's move those i2c specific files to its own subdirectory. The files to move were produced via the following script: mkdir include/media/i2c (cd include/media; for i in *.h; do n=`echo $i|sed s/.h$/.c/`; if [ -e ../../drivers/media/i2c/$n ]; then echo $i; git mv $i i2c/; fi; done) (cd include/media; for i in *.h; do n=`echo $i|sed s/.h$/.c/`; if [ -e ../../drivers/media/*/i2c/$n ]; then echo $i; git mv $i i2c/; fi; done) for i in include/media/*.h; do n=`basename $i`; (for j in $(git grep -l $n); do dirname $j; done)|sort|uniq|grep -ve '^.$' > list; num=$(wc -l list|cut -d' ' -f1); if [ $num == 1 ]; then if [ "`grep i2c list`" != "" ]; then git mv $i include/media/i2c; fi; fi; done And the references corrected via this script: MAIN_DIR="media/" PREV_DIR="media/" DIRS="i2c/" echo "Checking affected files" >&2 for i in $DIRS; do for j in $(find include/$MAIN_DIR/$i -type f -name '*.h'); do n=`basename $j` git grep -l $n done done|sort|uniq >files && ( echo "Handling files..." 
>&2; echo "for i in \$(cat files|grep -v Documentation); do cat \$i | \\"; ( cd include/$MAIN_DIR; for j in $DIRS; do for i in $(ls $j); do echo "perl -ne 's,(include [\\\"\\<])$PREV_DIR($i)([\\\"\\>]),\1$MAIN_DIR$j\2\3,; print \$_' |\\"; done; done; echo "cat > a && mv a \$i; done"; ); echo "Handling documentation..." >&2; echo "for i in MAINTAINERS \$(cat files); do cat \$i | \\"; ( cd include/$MAIN_DIR; for j in $DIRS; do for i in $(ls $j); do echo " perl -ne 's,include/$PREV_DIR($i)\b,include/$MAIN_DIR$j\1,; print \$_' |\\"; done; done; echo "cat > a && mv a \$i; done" ); ) >script && . ./script Merged Sakari Ailus patch that moves smiapp.h to include/media/i2c. Signed-off-by: Mauro Carvalho Chehab Acked-by: Arnd Bergmann diff --git a/MAINTAINERS b/MAINTAINERS index e9caa4b..a8e3f47 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -395,7 +395,7 @@ M: Sakari Ailus L: linux-media@vger.kernel.org S: Maintained F: drivers/media/i2c/adp1653.c -F: include/media/adp1653.h +F: include/media/i2c/adp1653.h ADP5520 BACKLIGHT DRIVER WITH IO EXPANDER (ADP5520/ADP5501) M: Michael Hennerich @@ -1773,7 +1773,7 @@ L: linux-media@vger.kernel.org T: git git://linuxtv.org/media_tree.git S: Maintained F: drivers/media/i2c/as3645a.c -F: include/media/as3645a.h +F: include/media/i2c/as3645a.h ASC7621 HARDWARE MONITOR DRIVER M: George Joseph @@ -4596,7 +4596,7 @@ M: Heungjun Kim L: linux-media@vger.kernel.org S: Maintained F: drivers/media/i2c/m5mols/ -F: include/media/m5mols.h +F: include/media/i2c/m5mols.h FUJITSU TABLET EXTRAS M: Robert Gerlach @@ -7169,7 +7169,7 @@ L: linux-media@vger.kernel.org T: git git://linuxtv.org/media_tree.git S: Maintained F: drivers/media/i2c/mt9m032.c -F: include/media/mt9m032.h +F: include/media/i2c/mt9m032.h MT9P031 APTINA CAMERA SENSOR M: Laurent Pinchart @@ -7177,7 +7177,7 @@ L: linux-media@vger.kernel.org T: git git://linuxtv.org/media_tree.git S: Maintained F: drivers/media/i2c/mt9p031.c -F: include/media/mt9p031.h +F: include/media/i2c/mt9p031.h MT9T001 APTINA CAMERA SENSOR M: Laurent Pinchart @@ -7185,7 +7185,7 @@ L: linux-media@vger.kernel.org T: git git://linuxtv.org/media_tree.git S: Maintained F: drivers/media/i2c/mt9t001.c -F: include/media/mt9t001.h +F: include/media/i2c/mt9t001.h MT9V032 APTINA CAMERA SENSOR M: Laurent Pinchart @@ -7194,7 +7194,7 @@ T: git git://linuxtv.org/media_tree.git S: Maintained F: Documentation/devicetree/bindings/media/i2c/mt9v032.txt F: drivers/media/i2c/mt9v032.c -F: include/media/mt9v032.h +F: include/media/i2c/mt9v032.h MULTIFUNCTION DEVICES (MFD) M: Lee Jones @@ -9751,7 +9751,7 @@ Q: http://patchwork.linuxtv.org/project/linux-media/list/ T: git git://linuxtv.org/mhadli/v4l-dvb-davinci_devices.git S: Maintained F: drivers/media/i2c/ov2659.c -F: include/media/ov2659.h +F: include/media/i2c/ov2659.h SILICON MOTION SM712 FRAME BUFFER DRIVER M: Sudip Mukherjee @@ -9840,7 +9840,7 @@ M: Sakari Ailus L: linux-media@vger.kernel.org S: Maintained F: drivers/media/i2c/smiapp/ -F: include/media/smiapp.h +F: include/media/i2c/smiapp.h F: drivers/media/i2c/smiapp-pll.c F: drivers/media/i2c/smiapp-pll.h F: include/uapi/linux/smiapp.h @@ -10781,7 +10781,7 @@ M: Mats Randgaard L: linux-media@vger.kernel.org S: Maintained F: drivers/media/i2c/tc358743* -F: include/media/tc358743.h +F: include/media/i2c/tc358743.h TMIO MMC DRIVER M: Ian Molton diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c index 1ed545c..9cc7b81 100644 --- a/arch/arm/mach-davinci/board-da850-evm.c +++ 
b/arch/arm/mach-davinci/board-da850-evm.c @@ -49,8 +49,8 @@ #include #include -#include -#include +#include +#include #define DA850_EVM_PHY_ID "davinci_mdio-0:00" #define DA850_LCD_PWR_PIN GPIO_TO_PIN(2, 8) diff --git a/arch/arm/mach-davinci/board-dm355-evm.c b/arch/arm/mach-davinci/board-dm355-evm.c index b46b4d2..c71dd99 100644 --- a/arch/arm/mach-davinci/board-dm355-evm.c +++ b/arch/arm/mach-davinci/board-dm355-evm.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-davinci/board-dm365-evm.c b/arch/arm/mach-davinci/board-dm365-evm.c index a756003..f073518 100644 --- a/arch/arm/mach-davinci/board-dm365-evm.c +++ b/arch/arm/mach-davinci/board-dm365-evm.c @@ -40,8 +40,8 @@ #include #include -#include -#include +#include +#include #include "davinci.h" diff --git a/arch/arm/mach-davinci/board-dm644x-evm.c b/arch/arm/mach-davinci/board-dm644x-evm.c index bbdd2d6..7a20507 100644 --- a/arch/arm/mach-davinci/board-dm644x-evm.c +++ b/arch/arm/mach-davinci/board-dm644x-evm.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-davinci/board-dm646x-evm.c b/arch/arm/mach-davinci/board-dm646x-evm.c index 846a84d..ee6ab7e 100644 --- a/arch/arm/mach-davinci/board-dm646x-evm.c +++ b/arch/arm/mach-davinci/board-dm646x-evm.c @@ -25,8 +25,8 @@ #include #include -#include -#include +#include +#include #include #include diff --git a/arch/arm/mach-pxa/pcm990-baseboard.c b/arch/arm/mach-pxa/pcm990-baseboard.c index b71c96f..e3b58cb 100644 --- a/arch/arm/mach-pxa/pcm990-baseboard.c +++ b/arch/arm/mach-pxa/pcm990-baseboard.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include diff --git a/arch/blackfin/mach-bf561/boards/ezkit.c b/arch/blackfin/mach-bf561/boards/ezkit.c index 2de71e8..f35525b 100644 --- a/arch/blackfin/mach-bf561/boards/ezkit.c +++ b/arch/blackfin/mach-bf561/boards/ezkit.c @@ -443,7 +443,7 @@ static const struct ppi_info ppi_info = { }; #if IS_ENABLED(CONFIG_VIDEO_ADV7183) -#include +#include static struct v4l2_input adv7183_inputs[] = { { .index = 0, diff --git a/arch/blackfin/mach-bf609/boards/ezkit.c b/arch/blackfin/mach-bf609/boards/ezkit.c index 2c61fc0..c7928d8 100644 --- a/arch/blackfin/mach-bf609/boards/ezkit.c +++ b/arch/blackfin/mach-bf609/boards/ezkit.c @@ -933,7 +933,7 @@ static struct bfin_capture_config bfin_capture_data = { #endif #if IS_ENABLED(CONFIG_VIDEO_ADV7842) -#include +#include static struct v4l2_input adv7842_inputs[] = { { @@ -1084,7 +1084,7 @@ static const struct ppi_info ppi_info = { }; #if IS_ENABLED(CONFIG_VIDEO_ADV7511) -#include +#include static struct v4l2_output adv7511_outputs[] = { { @@ -1125,7 +1125,7 @@ static struct bfin_display_config bfin_display_data = { #endif #if IS_ENABLED(CONFIG_VIDEO_ADV7343) -#include +#include static struct v4l2_output adv7343_outputs[] = { { diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c index cbd2a9f..62b045c 100644 --- a/arch/sh/boards/mach-ap325rxa/setup.c +++ b/arch/sh/boards/mach-ap325rxa/setup.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index d531791..5fcec76 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -40,8 +40,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include @@ -900,7 +900,7 @@ static struct platform_device 
irda_device = { .resource = irda_resources, }; -#include +#include #include static struct ak881x_pdata ak881x_pdata = { diff --git a/arch/sh/boards/mach-kfr2r09/setup.c b/arch/sh/boards/mach-kfr2r09/setup.c index 7d997ce..ec93573 100644 --- a/arch/sh/boards/mach-kfr2r09/setup.c +++ b/arch/sh/boards/mach-kfr2r09/setup.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include
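The perl rewrite quoted in the commit message maps each old include path to its new i2c/ location. Taking adp1653.h (listed in the MAINTAINERS hunk above) as a concrete example, the per-file change has this shape:

-#include <media/adp1653.h>
+#include <media/i2c/adp1653.h>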