summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/linux/ceph/ceph_features.h38
-rw-r--r--include/linux/ceph/ceph_fs.h32
-rw-r--r--include/linux/ceph/decode.h29
-rw-r--r--include/linux/ceph/libceph.h16
-rw-r--r--include/linux/ceph/mdsmap.h4
-rw-r--r--include/linux/ceph/messenger.h2
-rw-r--r--include/linux/ceph/osd_client.h74
-rw-r--r--include/linux/ceph/osdmap.h30
-rw-r--r--include/linux/ceph/rados.h158
-rw-r--r--include/linux/crush/crush.h2
10 files changed, 227 insertions, 158 deletions
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index dad579b..76554ce 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -12,16 +12,46 @@
#define CEPH_FEATURE_MONNAMES (1<<5)
#define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
-/* bits 8-17 defined by user-space; not supported yet here */
+#define CEPH_FEATURE_OBJECTLOCATOR (1<<8)
+#define CEPH_FEATURE_PGID64 (1<<9)
+#define CEPH_FEATURE_INCSUBOSDMAP (1<<10)
+#define CEPH_FEATURE_PGPOOL3 (1<<11)
+#define CEPH_FEATURE_OSDREPLYMUX (1<<12)
+#define CEPH_FEATURE_OSDENC (1<<13)
+#define CEPH_FEATURE_OMAP (1<<14)
+#define CEPH_FEATURE_MONENC (1<<15)
+#define CEPH_FEATURE_QUERY_T (1<<16)
+#define CEPH_FEATURE_INDEP_PG_MAP (1<<17)
#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
+#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19)
+#define CEPH_FEATURE_MON_NULLROUTE (1<<20)
+#define CEPH_FEATURE_MON_GV (1<<21)
+#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22)
+#define CEPH_FEATURE_MSG_AUTH (1<<23)
+#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24)
+#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
+#define CEPH_FEATURE_CREATEPOOLID (1<<26)
+#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
+#define CEPH_FEATURE_OSD_HBMSGS (1<<28)
+#define CEPH_FEATURE_MDSENC (1<<29)
+#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
/*
* Features supported.
*/
#define CEPH_FEATURES_SUPPORTED_DEFAULT \
- (CEPH_FEATURE_NOSRCADDR | \
- CEPH_FEATURE_CRUSH_TUNABLES)
+ (CEPH_FEATURE_NOSRCADDR | \
+ CEPH_FEATURE_PGID64 | \
+ CEPH_FEATURE_PGPOOL3 | \
+ CEPH_FEATURE_OSDENC | \
+ CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_CRUSH_TUNABLES2 | \
+ CEPH_FEATURE_REPLY_CREATE_INODE | \
+ CEPH_FEATURE_OSDHASHPSPOOL)
#define CEPH_FEATURES_REQUIRED_DEFAULT \
- (CEPH_FEATURE_NOSRCADDR)
+ (CEPH_FEATURE_NOSRCADDR | \
+ CEPH_FEATURE_PGID64 | \
+ CEPH_FEATURE_PGPOOL3 | \
+ CEPH_FEATURE_OSDENC)
#endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index cf6f4d9..2ad7b86 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -21,16 +21,14 @@
* internal cluster protocols separately from the public,
* client-facing protocol.
*/
-#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
-#define CEPH_MON_PROTOCOL 5 /* cluster internal */
#define CEPH_OSDC_PROTOCOL 24 /* server/client */
#define CEPH_MDSC_PROTOCOL 32 /* server/client */
#define CEPH_MONC_PROTOCOL 15 /* server/client */
-#define CEPH_INO_ROOT 1
-#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
+#define CEPH_INO_ROOT 1
+#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
+#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */
/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
#define CEPH_MAX_MON 31
@@ -51,7 +49,7 @@ struct ceph_file_layout {
__le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
/* object -> pg layout */
- __le32 fl_unused; /* unused; used to be preferred primary (-1) */
+ __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */
__le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
} __attribute__ ((packed));
@@ -101,6 +99,8 @@ struct ceph_dir_layout {
#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
#define CEPH_MSG_AUTH 17
#define CEPH_MSG_AUTH_REPLY 18
+#define CEPH_MSG_MON_GET_VERSION 19
+#define CEPH_MSG_MON_GET_VERSION_REPLY 20
/* client <-> mds */
#define CEPH_MSG_MDS_MAP 21
@@ -221,6 +221,11 @@ struct ceph_mon_subscribe_ack {
} __attribute__ ((packed));
/*
+ * mdsmap flags
+ */
+#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */
+
+/*
* mds states
* > 0 -> in
* <= 0 -> out
@@ -233,6 +238,7 @@ struct ceph_mon_subscribe_ack {
#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+#define CEPH_MDS_STATE_REPLAYONCE -9 /* up, replaying an active node's journal */
#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
@@ -264,6 +270,7 @@ extern const char *ceph_mds_state_name(int s);
#define CEPH_LOCK_IXATTR 2048
#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
+#define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */
/* client_session ops */
enum {
@@ -338,6 +345,12 @@ extern const char *ceph_mds_op_name(int op);
#define CEPH_SETATTR_SIZE 32
#define CEPH_SETATTR_CTIME 64
+/*
+ * Ceph setxattr request flags.
+ */
+#define CEPH_XATTR_CREATE 1
+#define CEPH_XATTR_REPLACE 2
+
union ceph_mds_request_args {
struct {
__le32 mask; /* CEPH_CAP_* */
@@ -522,14 +535,17 @@ int ceph_flags_to_mode(int flags);
#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
+#define CEPH_CAP_SIMPLE_BITS 2
+#define CEPH_CAP_FILE_BITS 8
+
/* per-lock shift */
#define CEPH_CAP_SAUTH 2
#define CEPH_CAP_SLINK 4
#define CEPH_CAP_SXATTR 6
#define CEPH_CAP_SFILE 8
-#define CEPH_CAP_SFLOCK 20
+#define CEPH_CAP_SFLOCK 20
-#define CEPH_CAP_BITS 22
+#define CEPH_CAP_BITS 22
/* composed values */
#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 63d0928..360d9d0 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
return end >= *p && n <= end - *p;
}
-#define ceph_decode_need(p, end, n, bad) \
- do { \
- if (!likely(ceph_has_room(p, end, n))) \
- goto bad; \
+#define ceph_decode_need(p, end, n, bad) \
+ do { \
+ if (!likely(ceph_has_room(p, end, n))) \
+ goto bad; \
} while (0)
#define ceph_decode_64_safe(p, end, v, bad) \
@@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
*
* There are two possible failures:
* - converting the string would require accessing memory at or
- * beyond the "end" pointer provided (-E
- * - memory could not be allocated for the result
+ * beyond the "end" pointer provided (-ERANGE)
+ * - memory could not be allocated for the result (-ENOMEM)
*/
static inline char *ceph_extract_encoded_string(void **p, void *end,
size_t *lenp, gfp_t gfp)
@@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end,
*p += len;
}
-#define ceph_encode_need(p, end, n, bad) \
- do { \
- if (!likely(ceph_has_room(p, end, n))) \
- goto bad; \
+#define ceph_encode_need(p, end, n, bad) \
+ do { \
+ if (!likely(ceph_has_room(p, end, n))) \
+ goto bad; \
} while (0)
#define ceph_encode_64_safe(p, end, v, bad) \
@@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end,
#define ceph_encode_32_safe(p, end, v, bad) \
do { \
ceph_encode_need(p, end, sizeof(u32), bad); \
- ceph_encode_32(p, v); \
+ ceph_encode_32(p, v); \
} while (0)
#define ceph_encode_16_safe(p, end, v, bad) \
do { \
ceph_encode_need(p, end, sizeof(u16), bad); \
- ceph_encode_16(p, v); \
+ ceph_encode_16(p, v); \
+ } while (0)
+#define ceph_encode_8_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u8), bad); \
+ ceph_encode_8(p, v); \
} while (0)
#define ceph_encode_copy_safe(p, end, pv, n, bad) \
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 084d3c6..29818fc 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -193,6 +193,8 @@ static inline int calc_pages_for(u64 off, u64 len)
}
/* ceph_common.c */
+extern bool libceph_compatible(void *data);
+
extern const char *ceph_msg_type_name(int type);
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
extern struct kmem_cache *ceph_inode_cachep;
@@ -220,7 +222,7 @@ extern int ceph_open_session(struct ceph_client *client);
/* pagevec.c */
extern void ceph_release_page_vector(struct page **pages, int num_pages);
-extern struct page **ceph_get_direct_page_vector(const char __user *data,
+extern struct page **ceph_get_direct_page_vector(const void __user *data,
int num_pages,
bool write_page);
extern void ceph_put_page_vector(struct page **pages, int num_pages,
@@ -228,15 +230,15 @@ extern void ceph_put_page_vector(struct page **pages, int num_pages,
extern void ceph_release_page_vector(struct page **pages, int num_pages);
extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
extern int ceph_copy_user_to_page_vector(struct page **pages,
- const char __user *data,
+ const void __user *data,
loff_t off, size_t len);
-extern int ceph_copy_to_page_vector(struct page **pages,
- const char *data,
+extern void ceph_copy_to_page_vector(struct page **pages,
+ const void *data,
loff_t off, size_t len);
-extern int ceph_copy_from_page_vector(struct page **pages,
- char *data,
+extern void ceph_copy_from_page_vector(struct page **pages,
+ void *data,
loff_t off, size_t len);
-extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
+extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data,
loff_t off, size_t len);
extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index cb15b5d..87ed09f 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -29,8 +29,8 @@ struct ceph_mdsmap {
/* which object pools file data can be stored in */
int m_num_data_pg_pools;
- u32 *m_data_pg_pools;
- u32 m_cas_pg_pool;
+ u64 *m_data_pg_pools;
+ u64 m_cas_pg_pool;
};
static inline struct ceph_entity_addr *
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 14ba5ee..60903e0 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -83,9 +83,11 @@ struct ceph_msg {
struct list_head list_head;
struct kref kref;
+#ifdef CONFIG_BLOCK
struct bio *bio; /* instead of pages/pagelist */
struct bio *bio_iter; /* bio iterator */
int bio_seg; /* current bio segment */
+#endif /* CONFIG_BLOCK */
struct ceph_pagelist *trail; /* the trailing part of the data */
bool front_is_vmalloc;
bool more_to_follow;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d9b880e..1dd5d46 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -10,6 +10,7 @@
#include <linux/ceph/osdmap.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
/*
* Maximum object name size
@@ -22,7 +23,6 @@ struct ceph_snap_context;
struct ceph_osd_request;
struct ceph_osd_client;
struct ceph_authorizer;
-struct ceph_pagelist;
/*
* completion callback for async writepages
@@ -47,6 +47,9 @@ struct ceph_osd {
struct list_head o_keepalive_item;
};
+
+#define CEPH_OSD_MAX_OP 10
+
/* an in-flight request */
struct ceph_osd_request {
u64 r_tid; /* unique for this client */
@@ -63,9 +66,23 @@ struct ceph_osd_request {
struct ceph_connection *r_con_filling_msg;
struct ceph_msg *r_request, *r_reply;
- int r_result;
int r_flags; /* any additional flags for the osd */
u32 r_sent; /* >0 if r_request is sending/sent */
+ int r_num_ops;
+
+ /* encoded message content */
+ struct ceph_osd_op *r_request_ops;
+ /* these are updated on each send */
+ __le32 *r_request_osdmap_epoch;
+ __le32 *r_request_flags;
+ __le64 *r_request_pool;
+ void *r_request_pgid;
+ __le32 *r_request_attempts;
+ struct ceph_eversion *r_request_reassert_version;
+
+ int r_result;
+ int r_reply_op_len[CEPH_OSD_MAX_OP];
+ s32 r_reply_op_result[CEPH_OSD_MAX_OP];
int r_got_reply;
int r_linger;
@@ -82,6 +99,7 @@ struct ceph_osd_request {
char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */
int r_oid_len;
+ u64 r_snapid;
unsigned long r_stamp; /* send OR check time */
struct ceph_file_layout r_file_layout;
@@ -95,7 +113,7 @@ struct ceph_osd_request {
struct bio *r_bio; /* instead of pages */
#endif
- struct ceph_pagelist *r_trail; /* trailing part of the data */
+ struct ceph_pagelist r_trail; /* trailing part of the data */
};
struct ceph_osd_event {
@@ -107,7 +125,6 @@ struct ceph_osd_event {
struct rb_node node;
struct list_head osd_node;
struct kref kref;
- struct completion completion;
};
struct ceph_osd_event_work {
@@ -157,7 +174,7 @@ struct ceph_osd_client {
struct ceph_osd_req_op {
u16 op; /* CEPH_OSD_OP_* */
- u32 flags; /* CEPH_OSD_FLAG_* */
+ u32 payload_len;
union {
struct {
u64 offset, length;
@@ -166,23 +183,24 @@ struct ceph_osd_req_op {
} extent;
struct {
const char *name;
- u32 name_len;
const char *val;
+ u32 name_len;
u32 value_len;
__u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
__u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
} xattr;
struct {
const char *class_name;
- __u8 class_len;
const char *method_name;
- __u8 method_len;
- __u8 argc;
const char *indata;
u32 indata_len;
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
} cls;
struct {
- u64 cookie, count;
+ u64 cookie;
+ u64 count;
} pgls;
struct {
u64 snapid;
@@ -190,12 +208,11 @@ struct ceph_osd_req_op {
struct {
u64 cookie;
u64 ver;
- __u8 flag;
u32 prot_ver;
u32 timeout;
+ __u8 flag;
} watch;
};
- u32 payload_len;
};
extern int ceph_osdc_init(struct ceph_osd_client *osdc,
@@ -207,29 +224,19 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
-extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
- struct ceph_file_layout *layout,
- u64 snapid,
- u64 off, u64 *plen, u64 *bno,
- struct ceph_osd_request *req,
- struct ceph_osd_req_op *op);
-
extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
- int flags,
struct ceph_snap_context *snapc,
- struct ceph_osd_req_op *ops,
+ unsigned int num_op,
bool use_mempool,
- gfp_t gfp_flags,
- struct page **pages,
- struct bio *bio);
+ gfp_t gfp_flags);
extern void ceph_osdc_build_request(struct ceph_osd_request *req,
- u64 off, u64 *plen,
+ u64 off, u64 len,
+ unsigned int num_op,
struct ceph_osd_req_op *src_ops,
struct ceph_snap_context *snapc,
- struct timespec *mtime,
- const char *oid,
- int oid_len);
+ u64 snap_id,
+ struct timespec *mtime);
extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
struct ceph_file_layout *layout,
@@ -239,8 +246,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
int do_sync, u32 truncate_seq,
u64 truncate_size,
struct timespec *mtime,
- bool use_mempool, int num_reply,
- int page_align);
+ bool use_mempool, int page_align);
extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
struct ceph_osd_request *req);
@@ -279,17 +285,13 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
struct timespec *mtime,
- struct page **pages, int nr_pages,
- int flags, int do_sync, bool nofail);
+ struct page **pages, int nr_pages);
/* watch/notify events */
extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
void (*event_cb)(u64, u64, u8, void *),
- int one_shot, void *data,
- struct ceph_osd_event **pevent);
+ void *data, struct ceph_osd_event **pevent);
extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
-extern int ceph_osdc_wait_event(struct ceph_osd_event *event,
- unsigned long timeout);
extern void ceph_osdc_put_event(struct ceph_osd_event *event);
#endif
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 10a417f..c819190 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -18,14 +18,31 @@
* The map can be updated either via an incremental map (diff) describing
* the change between two successive epochs, or as a fully encoded map.
*/
+struct ceph_pg {
+ uint64_t pool;
+ uint32_t seed;
+};
+
+#define CEPH_POOL_FLAG_HASHPSPOOL 1
+
struct ceph_pg_pool_info {
struct rb_node node;
- int id;
- struct ceph_pg_pool v;
- int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+ s64 id;
+ u8 type;
+ u8 size;
+ u8 crush_ruleset;
+ u8 object_hash;
+ u32 pg_num, pgp_num;
+ int pg_num_mask, pgp_num_mask;
+ u64 flags;
char *name;
};
+struct ceph_object_locator {
+ uint64_t pool;
+ char *key;
+};
+
struct ceph_pg_mapping {
struct rb_node node;
struct ceph_pg pgid;
@@ -110,15 +127,16 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
/* calculate mapping of a file extent to an object */
extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
- u64 off, u64 *plen,
+ u64 off, u64 len,
u64 *bno, u64 *oxoff, u64 *oxlen);
/* calculate mapping of object to a placement group */
-extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+extern int ceph_calc_object_layout(struct ceph_pg *pg,
const char *oid,
struct ceph_file_layout *fl,
struct ceph_osdmap *osdmap);
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
+ struct ceph_pg pgid,
int *acting);
extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
struct ceph_pg pgid);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2c04afe..68c96a5 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -9,14 +9,6 @@
#include <linux/ceph/msgr.h>
/*
- * osdmap encoding versions
- */
-#define CEPH_OSDMAP_INC_VERSION 5
-#define CEPH_OSDMAP_INC_VERSION_EXT 6
-#define CEPH_OSDMAP_VERSION 5
-#define CEPH_OSDMAP_VERSION_EXT 6
-
-/*
* fs id
*/
struct ceph_fsid {
@@ -64,7 +56,7 @@ struct ceph_timespec {
* placement group.
* we encode this into one __le64.
*/
-struct ceph_pg {
+struct ceph_pg_v1 {
__le16 preferred; /* preferred primary osd */
__le16 ps; /* placement seed */
__le32 pool; /* object pool */
@@ -91,21 +83,6 @@ struct ceph_pg {
#define CEPH_PG_TYPE_REP 1
#define CEPH_PG_TYPE_RAID4 2
-#define CEPH_PG_POOL_VERSION 2
-struct ceph_pg_pool {
- __u8 type; /* CEPH_PG_TYPE_* */
- __u8 size; /* number of osds in each pg */
- __u8 crush_ruleset; /* crush placement rule */
- __u8 object_hash; /* hash mapping object name to ps */
- __le32 pg_num, pgp_num; /* number of pg's */
- __le32 lpg_num, lpgp_num; /* number of localized pg's */
- __le32 last_change; /* most recent epoch changed */
- __le64 snap_seq; /* seq for per-pool snapshot */
- __le32 snap_epoch; /* epoch of last snap */
- __le32 num_snaps;
- __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
- __le64 auid; /* who owns the pg */
-} __attribute__ ((packed));
/*
* stable_mod func is used to control number of placement groups.
@@ -128,7 +105,7 @@ static inline int ceph_stable_mod(int x, int b, int bmask)
* object layout - how a given object should be stored.
*/
struct ceph_object_layout {
- struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
+ struct ceph_pg_v1 ol_pgid; /* raw pg, with _full_ ps precision. */
__le32 ol_stripe_unit; /* for per-object parity, if any */
} __attribute__ ((packed));
@@ -145,8 +122,12 @@ struct ceph_eversion {
*/
/* status bits */
-#define CEPH_OSD_EXISTS 1
-#define CEPH_OSD_UP 2
+#define CEPH_OSD_EXISTS (1<<0)
+#define CEPH_OSD_UP (1<<1)
+#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
+#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
+
+extern const char *ceph_osd_state_name(int s);
/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
#define CEPH_OSD_IN 0x10000
@@ -161,9 +142,25 @@ struct ceph_eversion {
#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
+#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
+#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
+#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
+#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
+#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+
+/*
+ * The error code to return when an OSD can't handle a write
+ * because it is too large.
+ */
+#define OSD_WRITETOOBIG EMSGSIZE
/*
* osd ops
+ *
+ * WARNING: do not use these op codes directly. Use the helpers
+ * defined below instead. In certain cases, op code behavior was
+ * redefined, resulting in special-cases in the helpers.
*/
#define CEPH_OSD_OP_MODE 0xf000
#define CEPH_OSD_OP_MODE_RD 0x1000
@@ -177,6 +174,7 @@ struct ceph_eversion {
#define CEPH_OSD_OP_TYPE_ATTR 0x0300
#define CEPH_OSD_OP_TYPE_EXEC 0x0400
#define CEPH_OSD_OP_TYPE_PG 0x0500
+#define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */
enum {
/** data **/
@@ -217,6 +215,23 @@ enum {
CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15,
+ /* omap */
+ CEPH_OSD_OP_OMAPGETKEYS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 17,
+ CEPH_OSD_OP_OMAPGETVALS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 18,
+ CEPH_OSD_OP_OMAPGETHEADER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 19,
+ CEPH_OSD_OP_OMAPGETVALSBYKEYS =
+ CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 20,
+ CEPH_OSD_OP_OMAPSETVALS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 21,
+ CEPH_OSD_OP_OMAPSETHEADER = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 22,
+ CEPH_OSD_OP_OMAPCLEAR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 23,
+ CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
+ CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
+
+ /** multi **/
+ CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
+ CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
+ CEPH_OSD_OP_SRC_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 3,
+
/** attrs **/
/* read */
CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -238,6 +253,7 @@ enum {
CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6,
CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7,
CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8,
+ CEPH_OSD_OP_SCRUB_MAP = CEPH_OSD_OP_MODE_SUB | 9,
/** lock **/
CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
@@ -248,10 +264,12 @@ enum {
CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
/** exec **/
+ /* note: the RD bit here is wrong; see special-case below in helper */
CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
/** pg **/
CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+ CEPH_OSD_OP_PGLS_FILTER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 2,
};
static inline int ceph_osd_op_type_lock(int op)
@@ -274,6 +292,10 @@ static inline int ceph_osd_op_type_pg(int op)
{
return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
}
+static inline int ceph_osd_op_type_multi(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_MULTI;
+}
static inline int ceph_osd_op_mode_subop(int op)
{
@@ -281,11 +303,12 @@ static inline int ceph_osd_op_mode_subop(int op)
}
static inline int ceph_osd_op_mode_read(int op)
{
- return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+ return (op & CEPH_OSD_OP_MODE_RD) &&
+ op != CEPH_OSD_OP_CALL;
}
static inline int ceph_osd_op_mode_modify(int op)
{
- return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+ return op & CEPH_OSD_OP_MODE_WR;
}
/*
@@ -294,34 +317,38 @@ static inline int ceph_osd_op_mode_modify(int op)
*/
#define CEPH_OSD_TMAP_HDR 'h'
#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
#define CEPH_OSD_TMAP_RM 'r'
+#define CEPH_OSD_TMAP_RMSLOPPY 'R'
extern const char *ceph_osd_op_name(int op);
-
/*
* osd op flags
*
* An op may be READ, WRITE, or READ|WRITE.
*/
enum {
- CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
- CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
- CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
- CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
- CEPH_OSD_FLAG_READ = 16, /* op may read */
- CEPH_OSD_FLAG_WRITE = 32, /* op may write */
- CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
- CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
- CEPH_OSD_FLAG_BALANCE_READS = 256,
- CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
- CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
- CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
- CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
+ CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */
+ CEPH_OSD_FLAG_READ = 0x0010, /* op may read */
+ CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */
+ CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 0x0100,
+ CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */
+ CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */
+ CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */
+ CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
+ CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
+ CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
};
enum {
CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
+ CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */
};
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
@@ -381,48 +408,13 @@ struct ceph_osd_op {
__le64 ver;
__u8 flag; /* 0 = unwatch, 1 = watch */
} __attribute__ ((packed)) watch;
-};
+ struct {
+ __le64 offset, length;
+ __le64 src_offset;
+ } __attribute__ ((packed)) clonerange;
+ };
__le32 payload_len;
} __attribute__ ((packed));
-/*
- * osd request message header. each request may include multiple
- * ceph_osd_op object operations.
- */
-struct ceph_osd_request_head {
- __le32 client_inc; /* client incarnation */
- struct ceph_object_layout layout; /* pgid */
- __le32 osdmap_epoch; /* client's osdmap epoch */
-
- __le32 flags;
-
- struct ceph_timespec mtime; /* for mutations only */
- struct ceph_eversion reassert_version; /* if we are replaying op */
-
- __le32 object_len; /* length of object name */
-
- __le64 snapid; /* snapid to read */
- __le64 snap_seq; /* writer's snap context */
- __le32 num_snaps;
-
- __le16 num_ops;
- struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
-} __attribute__ ((packed));
-
-struct ceph_osd_reply_head {
- __le32 client_inc; /* client incarnation */
- __le32 flags;
- struct ceph_object_layout layout;
- __le32 osdmap_epoch;
- struct ceph_eversion reassert_version; /* for replaying uncommitted */
-
- __le32 result; /* result code */
-
- __le32 object_len; /* length of object name */
- __le32 num_ops;
- struct ceph_osd_op ops[0]; /* ops[], object */
-} __attribute__ ((packed));
-
-
#endif
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 25baa28..6a1101f 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -162,6 +162,8 @@ struct crush_map {
__u32 choose_local_fallback_tries;
/* choose attempts before giving up */
__u32 choose_total_tries;
+ /* attempt chooseleaf inner descent once; on failure retry outer descent */
+ __u32 chooseleaf_descend_once;
};