aboutsummaryrefslogtreecommitdiff
path: root/fs/ceph/addr.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-03-24 18:32:48 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-03-24 18:32:48 -0700
commit85c7000fda0029ec16569b1eec8fd3a8d026be73 (patch)
treec6e7544175414fc6aa44f7f46f75242bf8c9b5c4 /fs/ceph/addr.c
parentb1b07ba356f04268230e16a8e1813fe1b19dac54 (diff)
parentf639d9867eea647005dc824e0e24f39ffc50d4e4 (diff)
Merge tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: "The highlights are: - several changes to how snap context and snap realms are tracked (Xiubo Li). In particular, this should resolve a long-standing issue of high kworker CPU usage and various stalls caused by needless iteration over all inodes in the snap realm. - async create fixes to address hangs in some edge cases (Jeff Layton) - support for getvxattr MDS op for querying server-side xattrs, such as file/directory layouts and ephemeral pins (Milind Changire) - average latency is now maintained for all metrics (Venky Shankar) - some tweaks around handling inline data to make it fit better with netfs helper library (David Howells) Also a couple of memory leaks got plugged along with a few assorted fixups. Last but not least, Xiubo has stepped up to serve as a CephFS co-maintainer" * tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client: (27 commits) ceph: fix memory leak in ceph_readdir when note_last_dentry returns error ceph: uninitialized variable in debug output ceph: use tracked average r/w/m latencies to display metrics in debugfs ceph: include average/stdev r/w/m latency in mds metrics ceph: track average r/w/m latency ceph: use ktime_to_timespec64() rather than jiffies_to_timespec64() ceph: assign the ci only when the inode isn't NULL ceph: fix inode reference leakage in ceph_get_snapdir() ceph: misc fix for code style and logs ceph: allocate capsnap memory outside of ceph_queue_cap_snap() ceph: do not release the global snaprealm until unmounting ceph: remove incorrect and unused CEPH_INO_DOTDOT macro MAINTAINERS: add Xiubo Li as cephfs co-maintainer ceph: eliminate the recursion when rebuilding the snap context ceph: do not update snapshot context when there is no new snapshot ceph: zero the dir_entries memory when allocating it ceph: move to a dedicated slabcache for ceph_cap_snap ceph: add getvxattr op libceph: drop else branches in prepare_read_data{,_cont} ceph: fix comments mentioning i_mutex ...
Diffstat (limited to 'fs/ceph/addr.c')
-rw-r--r--fs/ceph/addr.c240
1 files changed, 112 insertions, 128 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f6135c93ce9d..c7a0ab0d298b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -184,7 +184,7 @@ static int ceph_releasepage(struct page *page, gfp_t gfp)
static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
{
- struct inode *inode = rreq->mapping->host;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_layout *lo = &ci->i_layout;
u32 blockoff;
@@ -201,7 +201,7 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
{
- struct inode *inode = subreq->rreq->mapping->host;
+ struct inode *inode = subreq->rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
@@ -244,10 +244,63 @@ static void finish_netfs_read(struct ceph_osd_request *req)
iput(req->r_inode);
}
+static bool ceph_netfs_issue_op_inline(struct netfs_read_subrequest *subreq)
+{
+ struct netfs_read_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
+ struct ceph_mds_reply_info_parsed *rinfo;
+ struct ceph_mds_reply_info_in *iinfo;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct iov_iter iter;
+ ssize_t err = 0;
+ size_t len;
+
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
+
+ if (subreq->start >= inode->i_size)
+ goto out;
+
+ /* We need to fetch the inline data. */
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_ino1 = ci->i_vino;
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
+ req->r_num_caps = 2;
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto out;
+
+ rinfo = &req->r_reply_info;
+ iinfo = &rinfo->targeti;
+ if (iinfo->inline_version == CEPH_INLINE_NONE) {
+ /* The data got uninlined */
+ ceph_mdsc_put_request(req);
+ return false;
+ }
+
+ len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
+ if (err == 0)
+ err = -EFAULT;
+
+ ceph_mdsc_put_request(req);
+out:
+ netfs_subreq_terminated(subreq, err, false);
+ return true;
+}
+
static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
{
struct netfs_read_request *rreq = subreq->rreq;
- struct inode *inode = rreq->mapping->host;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
@@ -258,6 +311,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
int err = 0;
u64 len = subreq->len;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ ceph_netfs_issue_op_inline(subreq))
+ return;
+
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
0, 1, CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
@@ -326,23 +383,9 @@ static int ceph_readpage(struct file *file, struct page *subpage)
size_t len = folio_size(folio);
u64 off = folio_file_pos(folio);
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- /*
- * Uptodate inline data should have been added
- * into page cache while getting Fcr caps.
- */
- if (off == 0) {
- folio_unlock(folio);
- return -EINVAL;
- }
- zero_user_segment(&folio->page, 0, folio_size(folio));
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- return 0;
- }
-
- dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n",
- vino.ino, vino.snap, file, off, len, folio, folio_index(folio));
+ dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n inline %d",
+ vino.ino, vino.snap, file, off, len, folio, folio_index(folio),
+ ci->i_inline_version != CEPH_INLINE_NONE);
return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
}
@@ -1281,45 +1324,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
struct folio *folio = NULL;
- pgoff_t index = pos >> PAGE_SHIFT;
int r;
- /*
- * Uninlining should have already been done and everything updated, EXCEPT
- * for inline_version sent to the MDS.
- */
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
- if (aop_flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
- folio = __filemap_get_folio(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
-
- /*
- * The inline_version on a new inode is set to 1. If that's the
- * case, then the folio is brand new and isn't yet Uptodate.
- */
- r = 0;
- if (index == 0 && ci->i_inline_version != 1) {
- if (!folio_test_uptodate(folio)) {
- WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
- ci->i_inline_version);
- r = -EINVAL;
- }
- goto out;
- }
- zero_user_segment(&folio->page, 0, folio_size(folio));
- folio_mark_uptodate(folio);
- goto out;
- }
-
r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
&ceph_netfs_read_ops, NULL);
-out:
if (r == 0)
folio_wait_fscache(folio);
if (r < 0) {
@@ -1515,19 +1524,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- struct page *locked_page = NULL;
- if (off == 0) {
- lock_page(page);
- locked_page = page;
- }
- err = ceph_uninline_data(vma->vm_file, locked_page);
- if (locked_page)
- unlock_page(locked_page);
- if (err < 0)
- goto out_free;
- }
-
if (off + thp_size(page) <= size)
len = thp_size(page);
else
@@ -1584,11 +1580,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
ceph_put_snap_context(snapc);
} while (err == 0);
- if (ret == VM_FAULT_LOCKED ||
- ci->i_inline_version != CEPH_INLINE_NONE) {
+ if (ret == VM_FAULT_LOCKED) {
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -1652,16 +1646,30 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}
-int ceph_uninline_data(struct file *filp, struct page *locked_page)
+int ceph_uninline_data(struct file *file)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
- struct page *page = NULL;
- u64 len, inline_version;
+ struct ceph_cap_flush *prealloc_cf;
+ struct folio *folio = NULL;
+ u64 inline_version = CEPH_INLINE_NONE;
+ struct page *pages[1];
int err = 0;
- bool from_pagecache = false;
+ u64 len;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ folio = read_mapping_folio(inode->i_mapping, 0, file);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
+ goto out;
+ }
+
+ folio_lock(folio);
spin_lock(&ci->i_ceph_lock);
inline_version = ci->i_inline_version;
@@ -1672,45 +1680,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
if (inline_version == 1 || /* initial version, no data */
inline_version == CEPH_INLINE_NONE)
- goto out;
-
- if (locked_page) {
- page = locked_page;
- WARN_ON(!PageUptodate(page));
- } else if (ceph_caps_issued(ci) &
- (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
- page = find_get_page(inode->i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- from_pagecache = true;
- lock_page(page);
- } else {
- put_page(page);
- page = NULL;
- }
- }
- }
+ goto out_unlock;
- if (page) {
- len = i_size_read(inode);
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
- } else {
- page = __page_cache_alloc(GFP_NOFS);
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
- err = __ceph_do_getattr(inode, page,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (err < 0) {
- /* no inline data */
- if (err == -ENODATA)
- err = 0;
- goto out;
- }
- len = err;
- }
+ len = i_size_read(inode);
+ if (len > folio_size(folio))
+ len = folio_size(folio);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 0, 1,
@@ -1718,7 +1692,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
req->r_mtime = inode->i_mtime;
@@ -1727,7 +1701,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
if (err < 0)
- goto out;
+ goto out_unlock;
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 1, 3,
@@ -1736,10 +1710,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
- osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
+ pages[0] = folio_page(folio, 0);
+ osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);
{
__le64 xattr_buf = cpu_to_le64(inline_version);
@@ -1749,7 +1724,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
CEPH_OSD_CMPXATTR_OP_GT,
CEPH_OSD_CMPXATTR_MODE_U64);
if (err)
- goto out_put;
+ goto out_put_req;
}
{
@@ -1760,7 +1735,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
"inline_version",
xattr_buf, xattr_len, 0, 0);
if (err)
- goto out_put;
+ goto out_put_req;
}
req->r_mtime = inode->i_mtime;
@@ -1771,19 +1746,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
-out_put:
+ if (!err) {
+ int dirty;
+
+ /* Set to CAP_INLINE_NONE and dirty the caps */
+ down_read(&fsc->mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ up_read(&fsc->mdsc->snap_rwsem);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+out_put_req:
ceph_osdc_put_request(req);
if (err == -ECANCELED)
err = 0;
+out_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
out:
- if (page && page != locked_page) {
- if (from_pagecache) {
- unlock_page(page);
- put_page(page);
- } else
- __free_pages(page, 0);
- }
-
+ ceph_free_cap_flush(prealloc_cf);
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
inode, ceph_vinop(inode), inline_version, err);
return err;