summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-09-06 12:10:15 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-09-06 12:10:15 -0700
commit7ba2090ca64ea1aa435744884124387db1fac70f (patch)
treeed4ea24f4cfed5f28b9c8cdf99dbdf7df6a221ae /include
parent744a759492b5c57ff24a6e8aabe47b17ad8ee964 (diff)
parentce0d5bd3a6c176f9a3bf867624a07119dd4d0878 (diff)
Merge tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: "Mixed with some fixes and cleanups, this brings in reasonably complete fscrypt support to CephFS! The list of things which don't work with encryption should be fairly short, mostly around the edges: fallocate (not supported well in CephFS to begin with), copy_file_range (requires re-encryption), non-default striping patterns. This was a multi-year effort principally by Jeff Layton with assistance from Xiubo Li, Luís Henriques and others, including several dependant changes in the MDS, netfs helper library and fscrypt framework itself" * tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client: (53 commits) ceph: make num_fwd and num_retry to __u32 ceph: make members in struct ceph_mds_request_args_ext a union rbd: use list_for_each_entry() helper libceph: do not include crypto/algapi.h ceph: switch ceph_lookup/atomic_open() to use new fscrypt helper ceph: fix updating i_truncate_pagecache_size for fscrypt ceph: wait for OSD requests' callbacks to finish when unmounting ceph: drop messages from MDS when unmounting ceph: update documentation regarding snapshot naming limitations ceph: prevent snapshot creation in encrypted locked directories ceph: add support for encrypted snapshot names ceph: invalidate pages when doing direct/sync writes ceph: plumb in decryption during reads ceph: add encryption support to writepage and writepages ceph: add read/modify/write to ceph_sync_write ceph: align data in pages in ceph_sync_write ceph: don't use special DIO path for encrypted inodes ceph: add truncate size handling support for fscrypt ceph: add object version support for sync read libceph: allow ceph_osdc_new_request to accept a multi-op read ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/ceph/ceph_fs.h68
-rw-r--r--include/linux/ceph/messenger.h40
-rw-r--r--include/linux/ceph/osd_client.h93
-rw-r--r--include/linux/ceph/rados.h4
4 files changed, 180 insertions, 25 deletions
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 49586ff26152..5f2301ee88bc 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -359,14 +359,19 @@ enum {
extern const char *ceph_mds_op_name(int op);
-
-#define CEPH_SETATTR_MODE 1
-#define CEPH_SETATTR_UID 2
-#define CEPH_SETATTR_GID 4
-#define CEPH_SETATTR_MTIME 8
-#define CEPH_SETATTR_ATIME 16
-#define CEPH_SETATTR_SIZE 32
-#define CEPH_SETATTR_CTIME 64
+#define CEPH_SETATTR_MODE (1 << 0)
+#define CEPH_SETATTR_UID (1 << 1)
+#define CEPH_SETATTR_GID (1 << 2)
+#define CEPH_SETATTR_MTIME (1 << 3)
+#define CEPH_SETATTR_ATIME (1 << 4)
+#define CEPH_SETATTR_SIZE (1 << 5)
+#define CEPH_SETATTR_CTIME (1 << 6)
+#define CEPH_SETATTR_MTIME_NOW (1 << 7)
+#define CEPH_SETATTR_ATIME_NOW (1 << 8)
+#define CEPH_SETATTR_BTIME (1 << 9)
+#define CEPH_SETATTR_KILL_SGUID (1 << 10)
+#define CEPH_SETATTR_FSCRYPT_AUTH (1 << 11)
+#define CEPH_SETATTR_FSCRYPT_FILE (1 << 12)
/*
* Ceph setxattr request flags.
@@ -462,24 +467,26 @@ union ceph_mds_request_args {
} __attribute__ ((packed));
union ceph_mds_request_args_ext {
- union ceph_mds_request_args old;
- struct {
- __le32 mode;
- __le32 uid;
- __le32 gid;
- struct ceph_timespec mtime;
- struct ceph_timespec atime;
- __le64 size, old_size; /* old_size needed by truncate */
- __le32 mask; /* CEPH_SETATTR_* */
- struct ceph_timespec btime;
- } __attribute__ ((packed)) setattr_ext;
+ union {
+ union ceph_mds_request_args old;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ struct ceph_timespec btime;
+ } __attribute__ ((packed)) setattr_ext;
+ };
};
#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */
-struct ceph_mds_request_head_old {
+struct ceph_mds_request_head_legacy {
__le64 oldest_client_tid;
__le32 mdsmap_epoch; /* on client */
__le32 flags; /* CEPH_MDS_FLAG_* */
@@ -492,9 +499,9 @@ struct ceph_mds_request_head_old {
union ceph_mds_request_args args;
} __attribute__ ((packed));
-#define CEPH_MDS_REQUEST_HEAD_VERSION 1
+#define CEPH_MDS_REQUEST_HEAD_VERSION 2
-struct ceph_mds_request_head {
+struct ceph_mds_request_head_old {
__le16 version; /* struct version */
__le64 oldest_client_tid;
__le32 mdsmap_epoch; /* on client */
@@ -508,6 +515,23 @@ struct ceph_mds_request_head {
union ceph_mds_request_args_ext args;
} __attribute__ ((packed));
+struct ceph_mds_request_head {
+ __le16 version; /* struct version */
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args_ext args;
+
+ __le32 ext_num_retry; /* new count retry attempts */
+ __le32 ext_num_fwd; /* new count fwd attempts */
+} __attribute__ ((packed));
+
/* cap/lease release record */
struct ceph_mds_request_release {
__le64 ino, cap_id; /* ino and unique cap id */
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 99c1726be6ee..2eaaabbe98cb 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -17,6 +17,7 @@
struct ceph_msg;
struct ceph_connection;
+struct ceph_msg_data_cursor;
/*
* Ceph defines these callbacks for handling connection events.
@@ -70,6 +71,30 @@ struct ceph_connection_operations {
int used_proto, int result,
const int *allowed_protos, int proto_cnt,
const int *allowed_modes, int mode_cnt);
+
+ /**
+ * sparse_read: read sparse data
+ * @con: connection we're reading from
+ * @cursor: data cursor for reading extents
+ * @buf: optional buffer to read into
+ *
+ * This should be called more than once, each time setting up to
+ * receive an extent into the current cursor position, and zeroing
+ * the holes between them.
+ *
+ * Returns amount of data to be read (in bytes), 0 if reading is
+ * complete, or -errno if there was an error.
+ *
+ * If @buf is set on a >0 return, then the data should be read into
+ * the provided buffer. Otherwise, it should be read into the cursor.
+ *
+ * The sparse read operation is expected to initialize the cursor
+ * with a length covering up to the end of the last extent.
+ */
+ int (*sparse_read)(struct ceph_connection *con,
+ struct ceph_msg_data_cursor *cursor,
+ char **buf);
+
};
/* use format string %s%lld */
@@ -98,6 +123,7 @@ enum ceph_msg_data_type {
CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */
#endif /* CONFIG_BLOCK */
CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */
+ CEPH_MSG_DATA_ITER, /* data source/destination is an iov_iter */
};
#ifdef CONFIG_BLOCK
@@ -199,6 +225,7 @@ struct ceph_msg_data {
bool own_pages;
};
struct ceph_pagelist *pagelist;
+ struct iov_iter iter;
};
};
@@ -207,6 +234,7 @@ struct ceph_msg_data_cursor {
struct ceph_msg_data *data; /* current data item */
size_t resid; /* bytes not yet consumed */
+ int sr_resid; /* residual sparse_read len */
bool need_crc; /* crc update needed */
union {
#ifdef CONFIG_BLOCK
@@ -222,6 +250,10 @@ struct ceph_msg_data_cursor {
struct page *page; /* page from list */
size_t offset; /* bytes from list */
};
+ struct {
+ struct iov_iter iov_iter;
+ unsigned int lastlen;
+ };
};
};
@@ -251,6 +283,7 @@ struct ceph_msg {
struct kref kref;
bool more_to_follow;
bool needs_out_seq;
+ bool sparse_read;
int front_alloc_len;
struct ceph_msgpool *pool;
@@ -309,6 +342,10 @@ struct ceph_connection_v1_info {
int in_base_pos; /* bytes read */
+ /* sparse reads */
+ struct kvec in_sr_kvec; /* current location to receive into */
+ u64 in_sr_len; /* amount of data in this extent */
+
/* message in temps */
u8 in_tag; /* protocol control byte */
struct ceph_msg_header in_hdr;
@@ -395,6 +432,7 @@ struct ceph_connection_v2_info {
void *conn_bufs[16];
int conn_buf_cnt;
+ int data_len_remain;
struct kvec in_sign_kvecs[8];
struct kvec out_sign_kvecs[8];
@@ -573,6 +611,8 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
#endif /* CONFIG_BLOCK */
void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
struct ceph_bvec_iter *bvec_pos);
+void ceph_msg_data_add_iter(struct ceph_msg *msg,
+ struct iov_iter *iter);
struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
gfp_t flags, bool can_fail);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index fb6be72104df..bf9823956758 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -29,14 +29,62 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
#define CEPH_HOMELESS_OSD -1
-/* a given osd we're communicating with */
+/*
+ * A single extent in a SPARSE_READ reply.
+ *
+ * Note that these come from the OSD as little-endian values. On BE arches,
+ * we convert them in-place after receipt.
+ */
+struct ceph_sparse_extent {
+ u64 off;
+ u64 len;
+} __packed;
+
+/* Sparse read state machine state values */
+enum ceph_sparse_read_state {
+ CEPH_SPARSE_READ_HDR = 0,
+ CEPH_SPARSE_READ_EXTENTS,
+ CEPH_SPARSE_READ_DATA_LEN,
+ CEPH_SPARSE_READ_DATA,
+};
+
+/*
+ * A SPARSE_READ reply is a 32-bit count of extents, followed by an array of
+ * 64-bit offset/length pairs, and then all of the actual file data
+ * concatenated after it (sans holes).
+ *
+ * Unfortunately, we don't know how long the extent array is until we've
+ * started reading the data section of the reply. The caller should send down
+ * a destination buffer for the array, but we'll alloc one if it's too small
+ * or if the caller doesn't.
+ */
+struct ceph_sparse_read {
+ enum ceph_sparse_read_state sr_state; /* state machine state */
+ u64 sr_req_off; /* orig request offset */
+ u64 sr_req_len; /* orig request length */
+ u64 sr_pos; /* current pos in buffer */
+ int sr_index; /* current extent index */
+ __le32 sr_datalen; /* length of actual data */
+ u32 sr_count; /* extent count in reply */
+ int sr_ext_len; /* length of extent array */
+ struct ceph_sparse_extent *sr_extent; /* extent array */
+};
+
+/*
+ * A given osd we're communicating with.
+ *
+ * Note that the o_requests tree can be searched while holding the "lock" mutex
+ * or the "o_requests_lock" spinlock. Insertion or removal requires both!
+ */
struct ceph_osd {
refcount_t o_ref;
+ int o_sparse_op_idx;
struct ceph_osd_client *o_osdc;
int o_osd;
int o_incarnation;
struct rb_node o_node;
struct ceph_connection o_con;
+ spinlock_t o_requests_lock;
struct rb_root o_requests;
struct rb_root o_linger_requests;
struct rb_root o_backoff_mappings;
@@ -46,6 +94,7 @@ struct ceph_osd {
unsigned long lru_ttl;
struct list_head o_keepalive_item;
struct mutex lock;
+ struct ceph_sparse_read o_sparse_read;
};
#define CEPH_OSD_SLAB_OPS 2
@@ -59,6 +108,7 @@ enum ceph_osd_data_type {
CEPH_OSD_DATA_TYPE_BIO,
#endif /* CONFIG_BLOCK */
CEPH_OSD_DATA_TYPE_BVECS,
+ CEPH_OSD_DATA_TYPE_ITER,
};
struct ceph_osd_data {
@@ -82,6 +132,7 @@ struct ceph_osd_data {
struct ceph_bvec_iter bvec_pos;
u32 num_bvecs;
};
+ struct iov_iter iter;
};
};
@@ -98,6 +149,8 @@ struct ceph_osd_req_op {
u64 offset, length;
u64 truncate_size;
u32 truncate_seq;
+ int sparse_ext_cnt;
+ struct ceph_sparse_extent *sparse_ext;
struct ceph_osd_data osd_data;
} extent;
struct {
@@ -145,6 +198,9 @@ struct ceph_osd_req_op {
u32 src_fadvise_flags;
struct ceph_osd_data osd_data;
} copy_from;
+ struct {
+ u64 ver;
+ } assert_ver;
};
};
@@ -199,6 +255,7 @@ struct ceph_osd_request {
struct ceph_osd_client *r_osdc;
struct kref r_kref;
bool r_mempool;
+ bool r_linger; /* don't resend on failure */
struct completion r_completion; /* private to osd_client.c */
ceph_osdc_callback_t r_callback;
@@ -211,9 +268,9 @@ struct ceph_osd_request {
struct ceph_snap_context *r_snapc; /* for writes */
struct timespec64 r_mtime; /* ditto */
u64 r_data_offset; /* ditto */
- bool r_linger; /* don't resend on failure */
/* internal */
+ u64 r_version; /* data version sent in reply */
unsigned long r_stamp; /* jiffies, send or check time */
unsigned long r_start_stamp; /* jiffies */
ktime_t r_start_latency; /* ktime_t */
@@ -450,6 +507,8 @@ void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
unsigned int which,
struct ceph_bvec_iter *bvec_pos);
+void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
+ unsigned int which, struct iov_iter *iter);
extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
unsigned int which,
@@ -504,6 +563,20 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
u32 truncate_seq, u64 truncate_size,
bool use_mempool);
+int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt);
+
+/*
+ * How big an extent array should we preallocate for a sparse read? This is
+ * just a starting value. If we get more than this back from the OSD, the
+ * receiver will reallocate.
+ */
+#define CEPH_SPARSE_EXT_ARRAY_INITIAL 16
+
+static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op)
+{
+ return __ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
+}
+
extern void ceph_osdc_get_request(struct ceph_osd_request *req);
extern void ceph_osdc_put_request(struct ceph_osd_request *req);
@@ -558,5 +631,19 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
struct ceph_object_locator *oloc,
struct ceph_watch_item **watchers,
u32 *num_watchers);
-#endif
+/* Find offset into the buffer of the end of the extent map */
+static inline u64 ceph_sparse_ext_map_end(struct ceph_osd_req_op *op)
+{
+ struct ceph_sparse_extent *ext;
+
+ /* No extents? No data */
+ if (op->extent.sparse_ext_cnt == 0)
+ return 0;
+
+ ext = &op->extent.sparse_ext[op->extent.sparse_ext_cnt - 1];
+
+ return ext->off + ext->len - op->extent.offset;
+}
+
+#endif
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 43a7a1573b51..73c3efbec36c 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -524,6 +524,10 @@ struct ceph_osd_op {
__le64 cookie;
} __attribute__ ((packed)) notify;
struct {
+ __le64 unused;
+ __le64 ver;
+ } __attribute__ ((packed)) assert_ver;
+ struct {
__le64 offset, length;
__le64 src_offset;
} __attribute__ ((packed)) clonerange;