author | Mike Pagano <mpagano@gentoo.org> | 2022-05-01 13:02:58 -0400
---|---|---
committer | Mike Pagano <mpagano@gentoo.org> | 2022-05-01 13:02:58 -0400
commit | aa3aade4f155b96481a44b6733e806c8181271cc (patch) |
tree | 0d9e193aa1b1503cfffa79498d6c285193648117 |
parent | Remove redundant patch (diff) |
download | linux-patches-aa3aade4f155b96481a44b6733e806c8181271cc.tar.gz linux-patches-aa3aade4f155b96481a44b6733e806c8181271cc.tar.bz2 linux-patches-aa3aade4f155b96481a44b6733e806c8181271cc.zip |
Linux patch 5.15.37 (5.15-41)
Signed-off-by: Mike Pagano <mpagano@gentoo.org>
-rw-r--r-- | 0000_README | 4 |
-rw-r--r-- | 1036_linux-5.15.37.patch | 4223 |
2 files changed, 4227 insertions, 0 deletions
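As context for the 0000_README hunk at the start of the diff below: that file lists the genpatches queue as `Patch:`/`From:`/`Desc:` stanzas, and this commit appends a stanza for 1036_linux-5.15.37.patch. The following is a minimal, illustrative sketch of reading that stanza format; the parser is an assumption for demonstration and is not part of the repository's actual tooling.

```python
# Illustrative sketch only: parse 0000_README-style "Patch:/From:/Desc:" stanzas
# into an ordered list. This is not the real genpatches tooling.
from typing import List, Dict

def parse_readme(text: str) -> List[Dict[str, str]]:
    patches: List[Dict[str, str]] = []
    current: Dict[str, str] = {}
    for line in text.splitlines():
        if line.startswith("Patch:"):
            if current:
                patches.append(current)
            current = {"patch": line.split(":", 1)[1].strip()}
        elif line.startswith("From:") and current:
            current["from"] = line.split(":", 1)[1].strip()
        elif line.startswith("Desc:") and current:
            current["desc"] = line.split(":", 1)[1].strip()
    if current:
        patches.append(current)
    return patches

# Example input mirroring the entries visible in the 0000_README hunk below.
example = """Patch:  1035_linux-5.15.36.patch
From:   http://www.kernel.org
Desc:   Linux 5.15.36

Patch:  1036_linux-5.15.37.patch
From:   http://www.kernel.org
Desc:   Linux 5.15.37
"""

for entry in parse_readme(example):
    print(entry["patch"], "-", entry["desc"])
```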
diff --git a/0000_README b/0000_README index 0f44e39b..cb4266b1 100644 --- a/0000_README +++ b/0000_README @@ -187,6 +187,10 @@ Patch: 1035_linux-5.15.36.patch From: http://www.kernel.org Desc: Linux 5.15.36 +Patch: 1036_linux-5.15.37.patch +From: http://www.kernel.org +Desc: Linux 5.15.37 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1036_linux-5.15.37.patch b/1036_linux-5.15.37.patch new file mode 100644 index 00000000..b9d4c0ea --- /dev/null +++ b/1036_linux-5.15.37.patch @@ -0,0 +1,4223 @@ +diff --git a/Makefile b/Makefile +index e0710f9837847..50b1688a4ca2c 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 5 + PATCHLEVEL = 15 +-SUBLEVEL = 36 ++SUBLEVEL = 37 + EXTRAVERSION = + NAME = Trick or Treat + +diff --git a/arch/arm/boot/dts/socfpga.dtsi b/arch/arm/boot/dts/socfpga.dtsi +index 0b021eef0b538..7c1d6423d7f8c 100644 +--- a/arch/arm/boot/dts/socfpga.dtsi ++++ b/arch/arm/boot/dts/socfpga.dtsi +@@ -782,7 +782,7 @@ + }; + + qspi: spi@ff705000 { +- compatible = "cdns,qspi-nor"; ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0xff705000 0x1000>, +diff --git a/arch/arm/boot/dts/socfpga_arria10.dtsi b/arch/arm/boot/dts/socfpga_arria10.dtsi +index a574ea91d9d3f..3ba431dfa8c94 100644 +--- a/arch/arm/boot/dts/socfpga_arria10.dtsi ++++ b/arch/arm/boot/dts/socfpga_arria10.dtsi +@@ -756,7 +756,7 @@ + }; + + qspi: spi@ff809000 { +- compatible = "cdns,qspi-nor"; ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0xff809000 0x100>, +diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi +index d301ac0d406bf..3ec301bd08a91 100644 +--- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi ++++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi +@@ -594,7 +594,7 @@ + }; + + qspi: spi@ff8d2000 { +- compatible = "cdns,qspi-nor"; ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0xff8d2000 0x100>, +diff --git a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi +index de1e98c99ec5b..f4270cf189962 100644 +--- a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi ++++ b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi +@@ -628,7 +628,7 @@ + }; + + qspi: spi@ff8d2000 { +- compatible = "cdns,qspi-nor"; ++ compatible = "intel,socfpga-qspi", "cdns,qspi-nor"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0xff8d2000 0x100>, +diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c +index d89cf802d9aa7..6568823cf3063 100644 +--- a/arch/powerpc/kernel/kvm.c ++++ b/arch/powerpc/kernel/kvm.c +@@ -669,7 +669,8 @@ static void __init kvm_use_magic_page(void) + on_each_cpu(kvm_map_magic_page, &features, 1); + + /* Quick self-test to see if the mapping works */ +- if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) { ++ if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE, ++ sizeof(u32))) { + kvm_patching_worked = false; + return; + } +diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c +index f2da879264bcd..3e053e2fd6b69 100644 +--- a/arch/powerpc/kernel/signal_32.c ++++ b/arch/powerpc/kernel/signal_32.c +@@ -1048,7 +1048,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, + if (new_ctx == NULL) + return 0; + 
if (!access_ok(new_ctx, ctx_size) || +- fault_in_pages_readable((u8 __user *)new_ctx, ctx_size)) ++ fault_in_readable((char __user *)new_ctx, ctx_size)) + return -EFAULT; + + /* +@@ -1239,7 +1239,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx, + #endif + + if (!access_ok(ctx, sizeof(*ctx)) || +- fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx))) ++ fault_in_readable((char __user *)ctx, sizeof(*ctx))) + return -EFAULT; + + /* +diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c +index bb9c077ac1322..d1e1fc0acbea3 100644 +--- a/arch/powerpc/kernel/signal_64.c ++++ b/arch/powerpc/kernel/signal_64.c +@@ -688,7 +688,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, + if (new_ctx == NULL) + return 0; + if (!access_ok(new_ctx, ctx_size) || +- fault_in_pages_readable((u8 __user *)new_ctx, ctx_size)) ++ fault_in_readable((char __user *)new_ctx, ctx_size)) + return -EFAULT; + + /* +diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c +index 831b25c5e7058..7f71bd4dcd0d6 100644 +--- a/arch/x86/kernel/fpu/signal.c ++++ b/arch/x86/kernel/fpu/signal.c +@@ -205,7 +205,7 @@ retry: + fpregs_unlock(); + + if (ret) { +- if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) ++ if (!fault_in_writeable(buf_fx, fpu_user_xstate_size)) + goto retry; + return -EFAULT; + } +@@ -278,10 +278,9 @@ retry: + if (ret != -EFAULT) + return -EINVAL; + +- ret = fault_in_pages_readable(buf, size); +- if (!ret) ++ if (!fault_in_readable(buf, size)) + goto retry; +- return ret; ++ return -EFAULT; + } + + /* +diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig +index ab3e37aa1830c..f93cb989241cc 100644 +--- a/drivers/block/Kconfig ++++ b/drivers/block/Kconfig +@@ -33,6 +33,22 @@ config BLK_DEV_FD + To compile this driver as a module, choose M here: the + module will be called floppy. + ++config BLK_DEV_FD_RAWCMD ++ bool "Support for raw floppy disk commands (DEPRECATED)" ++ depends on BLK_DEV_FD ++ help ++ If you want to use actual physical floppies and expect to do ++ special low-level hardware accesses to them (access and use ++ non-standard formats, for example), then enable this. ++ ++ Note that the code enabled by this option is rarely used and ++ might be unstable or insecure, and distros should not enable it. ++ ++ Note: FDRAWCMD is deprecated and will be removed from the kernel ++ in the near future. ++ ++ If unsure, say N. 
++ + config AMIGA_FLOPPY + tristate "Amiga floppy support" + depends on AMIGA +diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c +index 0f58594c5a4d6..1c152b542a52d 100644 +--- a/drivers/block/floppy.c ++++ b/drivers/block/floppy.c +@@ -2984,6 +2984,8 @@ static const char *drive_name(int type, int drive) + return "(null)"; + } + ++#ifdef CONFIG_BLK_DEV_FD_RAWCMD ++ + /* raw commands */ + static void raw_cmd_done(int flag) + { +@@ -3183,6 +3185,35 @@ static int raw_cmd_ioctl(int cmd, void __user *param) + return ret; + } + ++static int floppy_raw_cmd_ioctl(int type, int drive, int cmd, ++ void __user *param) ++{ ++ int ret; ++ ++ pr_warn_once("Note: FDRAWCMD is deprecated and will be removed from the kernel in the near future.\n"); ++ ++ if (type) ++ return -EINVAL; ++ if (lock_fdc(drive)) ++ return -EINTR; ++ set_floppy(drive); ++ ret = raw_cmd_ioctl(cmd, param); ++ if (ret == -EINTR) ++ return -EINTR; ++ process_fd_request(); ++ return ret; ++} ++ ++#else /* CONFIG_BLK_DEV_FD_RAWCMD */ ++ ++static int floppy_raw_cmd_ioctl(int type, int drive, int cmd, ++ void __user *param) ++{ ++ return -EOPNOTSUPP; ++} ++ ++#endif ++ + static int invalidate_drive(struct block_device *bdev) + { + /* invalidate the buffer track to force a reread */ +@@ -3371,7 +3402,6 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int + { + int drive = (long)bdev->bd_disk->private_data; + int type = ITYPE(drive_state[drive].fd_device); +- int i; + int ret; + int size; + union inparam { +@@ -3522,16 +3552,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int + outparam = &write_errors[drive]; + break; + case FDRAWCMD: +- if (type) +- return -EINVAL; +- if (lock_fdc(drive)) +- return -EINTR; +- set_floppy(drive); +- i = raw_cmd_ioctl(cmd, (void __user *)param); +- if (i == -EINTR) +- return -EINTR; +- process_fd_request(); +- return i; ++ return floppy_raw_cmd_ioctl(type, drive, cmd, (void __user *)param); + case FDTWADDLE: + if (lock_fdc(drive)) + return -EINTR; +diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c +index 21909642ee4ca..8fbb25913327c 100644 +--- a/drivers/gpu/drm/armada/armada_gem.c ++++ b/drivers/gpu/drm/armada/armada_gem.c +@@ -336,7 +336,7 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data, + struct drm_armada_gem_pwrite *args = data; + struct armada_gem_object *dobj; + char __user *ptr; +- int ret; ++ int ret = 0; + + DRM_DEBUG_DRIVER("handle %u off %u size %u ptr 0x%llx\n", + args->handle, args->offset, args->size, args->ptr); +@@ -349,9 +349,8 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data, + if (!access_ok(ptr, args->size)) + return -EFAULT; + +- ret = fault_in_pages_readable(ptr, args->size); +- if (ret) +- return ret; ++ if (fault_in_readable(ptr, args->size)) ++ return -EFAULT; + + dobj = armada_gem_object_lookup(file, args->handle); + if (dobj == NULL) +diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c +index 75680eecd2f7d..2714ba02b176b 100644 +--- a/drivers/spi/spi-cadence-quadspi.c ++++ b/drivers/spi/spi-cadence-quadspi.c +@@ -36,6 +36,7 @@ + /* Quirks */ + #define CQSPI_NEEDS_WR_DELAY BIT(0) + #define CQSPI_DISABLE_DAC_MODE BIT(1) ++#define CQSPI_NO_SUPPORT_WR_COMPLETION BIT(3) + + /* Capabilities */ + #define CQSPI_SUPPORTS_OCTAL BIT(0) +@@ -83,6 +84,7 @@ struct cqspi_st { + u32 wr_delay; + bool use_direct_mode; + struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT]; ++ bool wr_completion; + }; + + struct 
cqspi_driver_platdata { +@@ -797,9 +799,11 @@ static int cqspi_write_setup(struct cqspi_flash_pdata *f_pdata, + * polling on the controller's side. spinand and spi-nor will take + * care of polling the status register. + */ +- reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL); +- reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL; +- writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL); ++ if (cqspi->wr_completion) { ++ reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL); ++ reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL; ++ writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL); ++ } + + reg = readl(reg_base + CQSPI_REG_SIZE); + reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK; +@@ -1532,6 +1536,10 @@ static int cqspi_probe(struct platform_device *pdev) + + cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk); + master->max_speed_hz = cqspi->master_ref_clk_hz; ++ ++ /* write completion is supported by default */ ++ cqspi->wr_completion = true; ++ + ddata = of_device_get_match_data(dev); + if (ddata) { + if (ddata->quirks & CQSPI_NEEDS_WR_DELAY) +@@ -1541,6 +1549,8 @@ static int cqspi_probe(struct platform_device *pdev) + master->mode_bits |= SPI_RX_OCTAL | SPI_TX_OCTAL; + if (!(ddata->quirks & CQSPI_DISABLE_DAC_MODE)) + cqspi->use_direct_mode = true; ++ if (ddata->quirks & CQSPI_NO_SUPPORT_WR_COMPLETION) ++ cqspi->wr_completion = false; + } + + ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0, +@@ -1649,6 +1659,10 @@ static const struct cqspi_driver_platdata intel_lgm_qspi = { + .quirks = CQSPI_DISABLE_DAC_MODE, + }; + ++static const struct cqspi_driver_platdata socfpga_qspi = { ++ .quirks = CQSPI_NO_SUPPORT_WR_COMPLETION, ++}; ++ + static const struct of_device_id cqspi_dt_ids[] = { + { + .compatible = "cdns,qspi-nor", +@@ -1666,6 +1680,10 @@ static const struct of_device_id cqspi_dt_ids[] = { + .compatible = "intel,lgm-qspi", + .data = &intel_lgm_qspi, + }, ++ { ++ .compatible = "intel,socfpga-qspi", ++ .data = (void *)&socfpga_qspi, ++ }, + { /* end of table */ } + }; + +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index dc1e4d1b72914..ff578c934bbcf 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1709,7 +1709,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, + * Fault pages before locking them in prepare_pages + * to avoid recursive lock + */ +- if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { ++ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { + ret = -EFAULT; + break; + } +@@ -1903,16 +1903,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, + + static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) + { ++ const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + loff_t pos; + ssize_t written = 0; + ssize_t written_buffered; ++ size_t prev_left = 0; + loff_t endbyte; + ssize_t err; + unsigned int ilock_flags = 0; +- struct iomap_dio *dio = NULL; + + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; +@@ -1955,23 +1956,80 @@ relock: + goto buffered; + } + +- dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, +- 0); ++ /* ++ * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw() ++ * calls generic_write_sync() (through iomap_dio_complete()), because ++ * that results in calling fsync (btrfs_sync_file()) which will try to ++ * lock the inode in exclusive/write mode. 
++ */ ++ if (is_sync_write) ++ iocb->ki_flags &= ~IOCB_DSYNC; + +- btrfs_inode_unlock(inode, ilock_flags); ++ /* ++ * The iov_iter can be mapped to the same file range we are writing to. ++ * If that's the case, then we will deadlock in the iomap code, because ++ * it first calls our callback btrfs_dio_iomap_begin(), which will create ++ * an ordered extent, and after that it will fault in the pages that the ++ * iov_iter refers to. During the fault in we end up in the readahead ++ * pages code (starting at btrfs_readahead()), which will lock the range, ++ * find that ordered extent and then wait for it to complete (at ++ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since ++ * obviously the ordered extent can never complete as we didn't submit ++ * yet the respective bio(s). This always happens when the buffer is ++ * memory mapped to the same file range, since the iomap DIO code always ++ * invalidates pages in the target file range (after starting and waiting ++ * for any writeback). ++ * ++ * So here we disable page faults in the iov_iter and then retry if we ++ * got -EFAULT, faulting in the pages before the retry. ++ */ ++again: ++ from->nofault = true; ++ err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, ++ IOMAP_DIO_PARTIAL, written); ++ from->nofault = false; + +- if (IS_ERR_OR_NULL(dio)) { +- err = PTR_ERR_OR_ZERO(dio); +- if (err < 0 && err != -ENOTBLK) +- goto out; +- } else { +- written = iomap_dio_complete(dio); ++ /* No increment (+=) because iomap returns a cumulative value. */ ++ if (err > 0) ++ written = err; ++ ++ if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { ++ const size_t left = iov_iter_count(from); ++ /* ++ * We have more data left to write. Try to fault in as many as ++ * possible of the remainder pages and retry. We do this without ++ * releasing and locking again the inode, to prevent races with ++ * truncate. ++ * ++ * Also, in case the iov refers to pages in the file range of the ++ * file we want to write to (due to a mmap), we could enter an ++ * infinite loop if we retry after faulting the pages in, since ++ * iomap will invalidate any pages in the range early on, before ++ * it tries to fault in the pages of the iov. So we keep track of ++ * how much was left of iov in the previous EFAULT and fallback ++ * to buffered IO in case we haven't made any progress. ++ */ ++ if (left == prev_left) { ++ err = -ENOTBLK; ++ } else { ++ fault_in_iov_iter_readable(from, left); ++ prev_left = left; ++ goto again; ++ } + } + +- if (written < 0 || !iov_iter_count(from)) { +- err = written; ++ btrfs_inode_unlock(inode, ilock_flags); ++ ++ /* ++ * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do ++ * the fsync (call generic_write_sync()). ++ */ ++ if (is_sync_write) ++ iocb->ki_flags |= IOCB_DSYNC; ++ ++ /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */ ++ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) + goto out; +- } + + buffered: + pos = iocb->ki_pos; +@@ -1996,7 +2054,7 @@ buffered: + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + out: +- return written ? written : err; ++ return err < 0 ? 
err : written; + } + + static ssize_t btrfs_file_write_iter(struct kiocb *iocb, +@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info, + static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) + { + struct inode *inode = file_inode(iocb->ki_filp); ++ size_t prev_left = 0; ++ ssize_t read = 0; + ssize_t ret; + + if (fsverity_active(inode)) +@@ -3668,9 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) + return 0; + + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); +- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0); ++again: ++ /* ++ * This is similar to what we do for direct IO writes, see the comment ++ * at btrfs_direct_write(), but we also disable page faults in addition ++ * to disabling them only at the iov_iter level. This is because when ++ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), ++ * which can still trigger page fault ins despite having set ->nofault ++ * to true of our 'to' iov_iter. ++ * ++ * The difference to direct IO writes is that we deadlock when trying ++ * to lock the extent range in the inode's tree during he page reads ++ * triggered by the fault in (while for writes it is due to waiting for ++ * our own ordered extent). This is because for direct IO reads, ++ * btrfs_dio_iomap_begin() returns with the extent range locked, which ++ * is only unlocked in the endio callback (end_bio_extent_readpage()). ++ */ ++ pagefault_disable(); ++ to->nofault = true; ++ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, ++ IOMAP_DIO_PARTIAL, read); ++ to->nofault = false; ++ pagefault_enable(); ++ ++ /* No increment (+=) because iomap returns a cumulative value. */ ++ if (ret > 0) ++ read = ret; ++ ++ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { ++ const size_t left = iov_iter_count(to); ++ ++ if (left == prev_left) { ++ /* ++ * We didn't make any progress since the last attempt, ++ * fallback to a buffered read for the remainder of the ++ * range. This is just to avoid any possibility of looping ++ * for too long. ++ */ ++ ret = read; ++ } else { ++ /* ++ * We made some progress since the last retry or this is ++ * the first time we are retrying. Fault in as many pages ++ * as possible and retry. ++ */ ++ fault_in_iov_iter_writeable(to, left); ++ prev_left = left; ++ goto again; ++ } ++ } + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); +- return ret; ++ return ret < 0 ? ret : read; + } + + static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 6266a706bff7d..044d584c3467c 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -7961,6 +7961,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + } + + len = min(len, em->len - (start - em->start)); ++ ++ /* ++ * If we have a NOWAIT request and the range contains multiple extents ++ * (or a mix of extents and holes), then we return -EAGAIN to make the ++ * caller fallback to a context where it can do a blocking (without ++ * NOWAIT) request. This way we avoid doing partial IO and returning ++ * success to the caller, which is not optimal for writes and for reads ++ * it can result in unexpected behaviour for an application. ++ * ++ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling ++ * iomap_dio_rw(), we can end up returning less data then what the caller ++ * asked for, resulting in an unexpected, and incorrect, short read. 
++ * That is, the caller asked to read N bytes and we return less than that, ++ * which is wrong unless we are crossing EOF. This happens if we get a ++ * page fault error when trying to fault in pages for the buffer that is ++ * associated to the struct iov_iter passed to iomap_dio_rw(), and we ++ * have previously submitted bios for other extents in the range, in ++ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of ++ * those bios have completed by the time we get the page fault error, ++ * which we return back to our caller - we should only return EIOCBQUEUED ++ * after we have submitted bios for all the extents in the range. ++ */ ++ if ((flags & IOMAP_NOWAIT) && len < length) { ++ free_extent_map(em); ++ ret = -EAGAIN; ++ goto unlock_err; ++ } ++ + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, len); +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index 6a863b3f6de03..bf53af8694f8e 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -2258,9 +2258,8 @@ static noinline int search_ioctl(struct inode *inode, + key.offset = sk->min_offset; + + while (1) { +- ret = fault_in_pages_writeable(ubuf + sk_offset, +- *buf_size - sk_offset); +- if (ret) ++ ret = -EFAULT; ++ if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset)) + break; + + ret = btrfs_search_forward(root, &key, path, sk->min_transid); +diff --git a/fs/erofs/data.c b/fs/erofs/data.c +index 9db8297156527..16a41d0db55a3 100644 +--- a/fs/erofs/data.c ++++ b/fs/erofs/data.c +@@ -287,7 +287,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + + if (!err) + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, +- NULL, 0); ++ NULL, 0, 0); + if (err < 0) + return err; + } +diff --git a/fs/ext4/file.c b/fs/ext4/file.c +index ac0e11bbb4450..b25c1f8f7c4f1 100644 +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) + return generic_file_read_iter(iocb, to); + } + +- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0); ++ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); +@@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) + if (ilock_shared) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, +- (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0); ++ (unaligned_io || extend) ? 
IOMAP_DIO_FORCE_WAIT : 0, ++ 0); + if (ret == -ENOTBLK) + ret = 0; + +diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c +index 0e14dc41ed4e6..8ef92719c6799 100644 +--- a/fs/f2fs/file.c ++++ b/fs/f2fs/file.c +@@ -4279,7 +4279,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + size_t target_size = 0; + int err; + +- if (iov_iter_fault_in_readable(from, iov_iter_count(from))) ++ if (fault_in_iov_iter_readable(from, iov_iter_count(from))) + set_inode_flag(inode, FI_NO_PREALLOC); + + if ((iocb->ki_flags & IOCB_NOWAIT)) { +diff --git a/fs/fuse/file.c b/fs/fuse/file.c +index bc50a9fa84a0c..71e9e301e569d 100644 +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -1164,7 +1164,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, + + again: + err = -EFAULT; +- if (iov_iter_fault_in_readable(ii, bytes)) ++ if (fault_in_iov_iter_readable(ii, bytes)) + break; + + err = -ENOMEM; +diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c +index bb9014ced702a..fbdb7a30470a3 100644 +--- a/fs/gfs2/bmap.c ++++ b/fs/gfs2/bmap.c +@@ -961,46 +961,6 @@ hole_found: + goto out; + } + +-static int gfs2_write_lock(struct inode *inode) +-{ +- struct gfs2_inode *ip = GFS2_I(inode); +- struct gfs2_sbd *sdp = GFS2_SB(inode); +- int error; +- +- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); +- error = gfs2_glock_nq(&ip->i_gh); +- if (error) +- goto out_uninit; +- if (&ip->i_inode == sdp->sd_rindex) { +- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); +- +- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, +- GL_NOCACHE, &m_ip->i_gh); +- if (error) +- goto out_unlock; +- } +- return 0; +- +-out_unlock: +- gfs2_glock_dq(&ip->i_gh); +-out_uninit: +- gfs2_holder_uninit(&ip->i_gh); +- return error; +-} +- +-static void gfs2_write_unlock(struct inode *inode) +-{ +- struct gfs2_inode *ip = GFS2_I(inode); +- struct gfs2_sbd *sdp = GFS2_SB(inode); +- +- if (&ip->i_inode == sdp->sd_rindex) { +- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); +- +- gfs2_glock_dq_uninit(&m_ip->i_gh); +- } +- gfs2_glock_dq_uninit(&ip->i_gh); +-} +- + static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, + unsigned len) + { +@@ -1118,11 +1078,6 @@ out_qunlock: + return ret; + } + +-static inline bool gfs2_iomap_need_write_lock(unsigned flags) +-{ +- return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT); +-} +- + static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap, + struct iomap *srcmap) +@@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + iomap->flags |= IOMAP_F_BUFFER_HEAD; + + trace_gfs2_iomap_start(ip, pos, length, flags); +- if (gfs2_iomap_need_write_lock(flags)) { +- ret = gfs2_write_lock(inode); +- if (ret) +- goto out; +- } +- + ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + if (ret) + goto out_unlock; +@@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp); + + out_unlock: +- if (ret && gfs2_iomap_need_write_lock(flags)) +- gfs2_write_unlock(inode); + release_metapath(&mp); +-out: + trace_gfs2_iomap_end(ip, iomap, ret); + return ret; + } +@@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, + } + + if (unlikely(!written)) +- goto out_unlock; ++ return 0; + + if (iomap->flags & IOMAP_F_SIZE_CHANGED) + mark_inode_dirty(inode); + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); +- +-out_unlock: 
+- if (gfs2_iomap_need_write_lock(flags)) +- gfs2_write_unlock(inode); + return 0; + } + +diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c +index 1c8b747072cba..247b8d95b5ef4 100644 +--- a/fs/gfs2/file.c ++++ b/fs/gfs2/file.c +@@ -777,27 +777,99 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, + return ret ? ret : ret1; + } + ++static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, ++ size_t *prev_count, ++ size_t *window_size) ++{ ++ char __user *p = i->iov[0].iov_base + i->iov_offset; ++ size_t count = iov_iter_count(i); ++ int pages = 1; ++ ++ if (likely(!count)) ++ return false; ++ if (ret <= 0 && ret != -EFAULT) ++ return false; ++ if (!iter_is_iovec(i)) ++ return false; ++ ++ if (*prev_count != count || !*window_size) { ++ int pages, nr_dirtied; ++ ++ pages = min_t(int, BIO_MAX_VECS, ++ DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE)); ++ nr_dirtied = max(current->nr_dirtied_pause - ++ current->nr_dirtied, 1); ++ pages = min(pages, nr_dirtied); ++ } ++ ++ *prev_count = count; ++ *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p); ++ return true; ++} ++ + static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, + struct gfs2_holder *gh) + { + struct file *file = iocb->ki_filp; + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); +- size_t count = iov_iter_count(to); ++ size_t prev_count = 0, window_size = 0; ++ size_t written = 0; + ssize_t ret; + +- if (!count) ++ /* ++ * In this function, we disable page faults when we're holding the ++ * inode glock while doing I/O. If a page fault occurs, we indicate ++ * that the inode glock may be dropped, fault in the pages manually, ++ * and retry. ++ * ++ * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger ++ * physical as well as manual page faults, and we need to disable both ++ * kinds. ++ * ++ * For direct I/O, gfs2 takes the inode glock in deferred mode. This ++ * locking mode is compatible with other deferred holders, so multiple ++ * processes and nodes can do direct I/O to a file at the same time. ++ * There's no guarantee that reads or writes will be atomic. Any ++ * coordination among readers and writers needs to happen externally. 
++ */ ++ ++ if (!iov_iter_count(to)) + return 0; /* skip atime */ + + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); ++retry: + ret = gfs2_glock_nq(gh); + if (ret) + goto out_uninit; ++retry_under_glock: ++ pagefault_disable(); ++ to->nofault = true; ++ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, ++ IOMAP_DIO_PARTIAL, written); ++ to->nofault = false; ++ pagefault_enable(); ++ if (ret > 0) ++ written = ret; ++ ++ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { ++ size_t leftover; + +- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0); +- gfs2_glock_dq(gh); ++ gfs2_holder_allow_demote(gh); ++ leftover = fault_in_iov_iter_writeable(to, window_size); ++ gfs2_holder_disallow_demote(gh); ++ if (leftover != window_size) { ++ if (!gfs2_holder_queued(gh)) ++ goto retry; ++ goto retry_under_glock; ++ } ++ } ++ if (gfs2_holder_queued(gh)) ++ gfs2_glock_dq(gh); + out_uninit: + gfs2_holder_uninit(gh); +- return ret; ++ if (ret < 0) ++ return ret; ++ return written; + } + + static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, +@@ -806,10 +878,20 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); +- size_t len = iov_iter_count(from); +- loff_t offset = iocb->ki_pos; ++ size_t prev_count = 0, window_size = 0; ++ size_t read = 0; + ssize_t ret; + ++ /* ++ * In this function, we disable page faults when we're holding the ++ * inode glock while doing I/O. If a page fault occurs, we indicate ++ * that the inode glock may be dropped, fault in the pages manually, ++ * and retry. ++ * ++ * For writes, iomap_dio_rw only triggers manual page faults, so we ++ * don't need to disable physical ones. ++ */ ++ + /* + * Deferred lock, even if its a write, since we do no allocation on + * this path. All we need to change is the atime, and this lock mode +@@ -819,31 +901,62 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, + * VFS does. 
+ */ + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); ++retry: + ret = gfs2_glock_nq(gh); + if (ret) + goto out_uninit; +- ++retry_under_glock: + /* Silently fall back to buffered I/O when writing beyond EOF */ +- if (offset + len > i_size_read(&ip->i_inode)) ++ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) + goto out; + +- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0); ++ from->nofault = true; ++ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, ++ IOMAP_DIO_PARTIAL, read); ++ from->nofault = false; ++ + if (ret == -ENOTBLK) + ret = 0; ++ if (ret > 0) ++ read = ret; ++ ++ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { ++ size_t leftover; ++ ++ gfs2_holder_allow_demote(gh); ++ leftover = fault_in_iov_iter_readable(from, window_size); ++ gfs2_holder_disallow_demote(gh); ++ if (leftover != window_size) { ++ if (!gfs2_holder_queued(gh)) ++ goto retry; ++ goto retry_under_glock; ++ } ++ } + out: +- gfs2_glock_dq(gh); ++ if (gfs2_holder_queued(gh)) ++ gfs2_glock_dq(gh); + out_uninit: + gfs2_holder_uninit(gh); +- return ret; ++ if (ret < 0) ++ return ret; ++ return read; + } + + static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + { + struct gfs2_inode *ip; + struct gfs2_holder gh; ++ size_t prev_count = 0, window_size = 0; + size_t written = 0; + ssize_t ret; + ++ /* ++ * In this function, we disable page faults when we're holding the ++ * inode glock while doing I/O. If a page fault occurs, we indicate ++ * that the inode glock may be dropped, fault in the pages manually, ++ * and retry. ++ */ ++ + if (iocb->ki_flags & IOCB_DIRECT) { + ret = gfs2_file_direct_read(iocb, to, &gh); + if (likely(ret != -ENOTBLK)) +@@ -865,18 +978,118 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + } + ip = GFS2_I(iocb->ki_filp->f_mapping->host); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); ++retry: + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; ++retry_under_glock: ++ pagefault_disable(); + ret = generic_file_read_iter(iocb, to); ++ pagefault_enable(); + if (ret > 0) + written += ret; +- gfs2_glock_dq(&gh); ++ ++ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { ++ size_t leftover; ++ ++ gfs2_holder_allow_demote(&gh); ++ leftover = fault_in_iov_iter_writeable(to, window_size); ++ gfs2_holder_disallow_demote(&gh); ++ if (leftover != window_size) { ++ if (!gfs2_holder_queued(&gh)) { ++ if (written) ++ goto out_uninit; ++ goto retry; ++ } ++ goto retry_under_glock; ++ } ++ } ++ if (gfs2_holder_queued(&gh)) ++ gfs2_glock_dq(&gh); + out_uninit: + gfs2_holder_uninit(&gh); + return written ? written : ret; + } + ++static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, ++ struct iov_iter *from, ++ struct gfs2_holder *gh) ++{ ++ struct file *file = iocb->ki_filp; ++ struct inode *inode = file_inode(file); ++ struct gfs2_inode *ip = GFS2_I(inode); ++ struct gfs2_sbd *sdp = GFS2_SB(inode); ++ struct gfs2_holder *statfs_gh = NULL; ++ size_t prev_count = 0, window_size = 0; ++ size_t read = 0; ++ ssize_t ret; ++ ++ /* ++ * In this function, we disable page faults when we're holding the ++ * inode glock while doing I/O. If a page fault occurs, we indicate ++ * that the inode glock may be dropped, fault in the pages manually, ++ * and retry. 
++ */ ++ ++ if (inode == sdp->sd_rindex) { ++ statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS); ++ if (!statfs_gh) ++ return -ENOMEM; ++ } ++ ++ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); ++retry: ++ ret = gfs2_glock_nq(gh); ++ if (ret) ++ goto out_uninit; ++retry_under_glock: ++ if (inode == sdp->sd_rindex) { ++ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); ++ ++ ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, ++ GL_NOCACHE, statfs_gh); ++ if (ret) ++ goto out_unlock; ++ } ++ ++ current->backing_dev_info = inode_to_bdi(inode); ++ pagefault_disable(); ++ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); ++ pagefault_enable(); ++ current->backing_dev_info = NULL; ++ if (ret > 0) { ++ iocb->ki_pos += ret; ++ read += ret; ++ } ++ ++ if (inode == sdp->sd_rindex) ++ gfs2_glock_dq_uninit(statfs_gh); ++ ++ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { ++ size_t leftover; ++ ++ gfs2_holder_allow_demote(gh); ++ leftover = fault_in_iov_iter_readable(from, window_size); ++ gfs2_holder_disallow_demote(gh); ++ if (leftover != window_size) { ++ if (!gfs2_holder_queued(gh)) { ++ if (read) ++ goto out_uninit; ++ goto retry; ++ } ++ goto retry_under_glock; ++ } ++ } ++out_unlock: ++ if (gfs2_holder_queued(gh)) ++ gfs2_glock_dq(gh); ++out_uninit: ++ gfs2_holder_uninit(gh); ++ if (statfs_gh) ++ kfree(statfs_gh); ++ return read ? read : ret; ++} ++ + /** + * gfs2_file_write_iter - Perform a write to a file + * @iocb: The io context +@@ -928,9 +1141,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + goto out_unlock; + + iocb->ki_flags |= IOCB_DSYNC; +- current->backing_dev_info = inode_to_bdi(inode); +- buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); +- current->backing_dev_info = NULL; ++ buffered = gfs2_file_buffered_write(iocb, from, &gh); + if (unlikely(buffered <= 0)) { + if (!ret) + ret = buffered; +@@ -944,7 +1155,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + * the direct I/O range as we don't know if the buffered pages + * made it to disk. + */ +- iocb->ki_pos += buffered; + ret2 = generic_write_sync(iocb, buffered); + invalidate_mapping_pages(mapping, + (iocb->ki_pos - buffered) >> PAGE_SHIFT, +@@ -952,13 +1162,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + if (!ret || ret2 > 0) + ret += ret2; + } else { +- current->backing_dev_info = inode_to_bdi(inode); +- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); +- current->backing_dev_info = NULL; +- if (likely(ret > 0)) { +- iocb->ki_pos += ret; ++ ret = gfs2_file_buffered_write(iocb, from, &gh); ++ if (likely(ret > 0)) + ret = generic_write_sync(iocb, ret); +- } + } + + out_unlock: +diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c +index 02cd0ae98208d..e85ef6b14777d 100644 +--- a/fs/gfs2/glock.c ++++ b/fs/gfs2/glock.c +@@ -58,6 +58,7 @@ struct gfs2_glock_iter { + typedef void (*glock_examiner) (struct gfs2_glock * gl); + + static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); ++static void __gfs2_glock_dq(struct gfs2_holder *gh); + + static struct dentry *gfs2_root; + static struct workqueue_struct *glock_workqueue; +@@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl) + + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; ++ /* ++ * Note that demote_ok is used for the lru process of disposing of ++ * glocks. 
For this purpose, we don't care if the glock's holders ++ * have the HIF_MAY_DEMOTE flag set or not. If someone is using ++ * them, don't demote. ++ */ + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) +@@ -301,46 +308,59 @@ void gfs2_glock_put(struct gfs2_glock *gl) + } + + /** +- * may_grant - check if its ok to grant a new lock ++ * may_grant - check if it's ok to grant a new lock + * @gl: The glock ++ * @current_gh: One of the current holders of @gl + * @gh: The lock request which we wish to grant + * +- * Returns: true if its ok to grant the lock ++ * With our current compatibility rules, if a glock has one or more active ++ * holders (HIF_HOLDER flag set), any of those holders can be passed in as ++ * @current_gh; they are all the same as far as compatibility with the new @gh ++ * goes. ++ * ++ * Returns true if it's ok to grant the lock. + */ + +-static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) +-{ +- const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list); ++static inline bool may_grant(struct gfs2_glock *gl, ++ struct gfs2_holder *current_gh, ++ struct gfs2_holder *gh) ++{ ++ if (current_gh) { ++ GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, ¤t_gh->gh_iflags)); ++ ++ switch(current_gh->gh_state) { ++ case LM_ST_EXCLUSIVE: ++ /* ++ * Here we make a special exception to grant holders ++ * who agree to share the EX lock with other holders ++ * who also have the bit set. If the original holder ++ * has the LM_FLAG_NODE_SCOPE bit set, we grant more ++ * holders with the bit set. ++ */ ++ return gh->gh_state == LM_ST_EXCLUSIVE && ++ (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) && ++ (gh->gh_flags & LM_FLAG_NODE_SCOPE); + +- if (gh != gh_head) { +- /** +- * Here we make a special exception to grant holders who agree +- * to share the EX lock with other holders who also have the +- * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit +- * is set, we grant more holders with the bit set. 
+- */ +- if (gh_head->gh_state == LM_ST_EXCLUSIVE && +- (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) && +- gh->gh_state == LM_ST_EXCLUSIVE && +- (gh->gh_flags & LM_FLAG_NODE_SCOPE)) +- return 1; +- if ((gh->gh_state == LM_ST_EXCLUSIVE || +- gh_head->gh_state == LM_ST_EXCLUSIVE)) +- return 0; ++ case LM_ST_SHARED: ++ case LM_ST_DEFERRED: ++ return gh->gh_state == current_gh->gh_state; ++ ++ default: ++ return false; ++ } + } ++ + if (gl->gl_state == gh->gh_state) +- return 1; ++ return true; + if (gh->gh_flags & GL_EXACT) +- return 0; ++ return false; + if (gl->gl_state == LM_ST_EXCLUSIVE) { +- if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED) +- return 1; +- if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED) +- return 1; ++ return gh->gh_state == LM_ST_SHARED || ++ gh->gh_state == LM_ST_DEFERRED; + } +- if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) +- return 1; +- return 0; ++ if (gh->gh_flags & LM_FLAG_ANY) ++ return gl->gl_state != LM_ST_UNLOCKED; ++ return false; + } + + static void gfs2_holder_wake(struct gfs2_holder *gh) +@@ -366,7 +386,7 @@ static void do_error(struct gfs2_glock *gl, const int ret) + struct gfs2_holder *gh, *tmp; + + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { +- if (test_bit(HIF_HOLDER, &gh->gh_iflags)) ++ if (!test_bit(HIF_WAIT, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; +@@ -380,6 +400,78 @@ static void do_error(struct gfs2_glock *gl, const int ret) + } + } + ++/** ++ * demote_incompat_holders - demote incompatible demoteable holders ++ * @gl: the glock we want to promote ++ * @new_gh: the new holder to be promoted ++ */ ++static void demote_incompat_holders(struct gfs2_glock *gl, ++ struct gfs2_holder *new_gh) ++{ ++ struct gfs2_holder *gh; ++ ++ /* ++ * Demote incompatible holders before we make ourselves eligible. ++ * (This holder may or may not allow auto-demoting, but we don't want ++ * to demote the new holder before it's even granted.) ++ */ ++ list_for_each_entry(gh, &gl->gl_holders, gh_list) { ++ /* ++ * Since holders are at the front of the list, we stop when we ++ * find the first non-holder. ++ */ ++ if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) ++ return; ++ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) && ++ !may_grant(gl, new_gh, gh)) { ++ /* ++ * We should not recurse into do_promote because ++ * __gfs2_glock_dq only calls handle_callback, ++ * gfs2_glock_add_to_lru and __gfs2_glock_queue_work. ++ */ ++ __gfs2_glock_dq(gh); ++ } ++ } ++} ++ ++/** ++ * find_first_holder - find the first "holder" gh ++ * @gl: the glock ++ */ ++ ++static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) ++{ ++ struct gfs2_holder *gh; ++ ++ if (!list_empty(&gl->gl_holders)) { ++ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, ++ gh_list); ++ if (test_bit(HIF_HOLDER, &gh->gh_iflags)) ++ return gh; ++ } ++ return NULL; ++} ++ ++/** ++ * find_first_strong_holder - find the first non-demoteable holder ++ * @gl: the glock ++ * ++ * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set. 
++ */ ++static inline struct gfs2_holder * ++find_first_strong_holder(struct gfs2_glock *gl) ++{ ++ struct gfs2_holder *gh; ++ ++ list_for_each_entry(gh, &gl->gl_holders, gh_list) { ++ if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) ++ return NULL; ++ if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) ++ return gh; ++ } ++ return NULL; ++} ++ + /** + * do_promote - promote as many requests as possible on the current queue + * @gl: The glock +@@ -393,14 +485,21 @@ __releases(&gl->gl_lockref.lock) + __acquires(&gl->gl_lockref.lock) + { + const struct gfs2_glock_operations *glops = gl->gl_ops; +- struct gfs2_holder *gh, *tmp; ++ struct gfs2_holder *gh, *tmp, *first_gh; ++ bool incompat_holders_demoted = false; + int ret; + + restart: ++ first_gh = find_first_strong_holder(gl); + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { +- if (test_bit(HIF_HOLDER, &gh->gh_iflags)) ++ if (!test_bit(HIF_WAIT, &gh->gh_iflags)) + continue; +- if (may_grant(gl, gh)) { ++ if (may_grant(gl, first_gh, gh)) { ++ if (!incompat_holders_demoted) { ++ demote_incompat_holders(gl, first_gh); ++ incompat_holders_demoted = true; ++ first_gh = gh; ++ } + if (gh->gh_list.prev == &gl->gl_holders && + glops->go_lock) { + spin_unlock(&gl->gl_lockref.lock); +@@ -426,6 +525,11 @@ restart: + gfs2_holder_wake(gh); + continue; + } ++ /* ++ * If we get here, it means we may not grant this holder for ++ * some reason. If this holder is the head of the list, it ++ * means we have a blocked holder at the head, so return 1. ++ */ + if (gh->gh_list.prev == &gl->gl_holders) + return 1; + do_error(gl, 0); +@@ -722,23 +826,6 @@ out: + spin_lock(&gl->gl_lockref.lock); + } + +-/** +- * find_first_holder - find the first "holder" gh +- * @gl: the glock +- */ +- +-static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) +-{ +- struct gfs2_holder *gh; +- +- if (!list_empty(&gl->gl_holders)) { +- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); +- if (test_bit(HIF_HOLDER, &gh->gh_iflags)) +- return gh; +- } +- return NULL; +-} +- + /** + * run_queue - do all outstanding tasks related to a glock + * @gl: The glock in question +@@ -1354,15 +1441,20 @@ __acquires(&gl->gl_lockref.lock) + GLOCK_BUG_ON(gl, true); + + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { +- if (test_bit(GLF_LOCK, &gl->gl_flags)) +- try_futile = !may_grant(gl, gh); ++ if (test_bit(GLF_LOCK, &gl->gl_flags)) { ++ struct gfs2_holder *first_gh; ++ ++ first_gh = find_first_strong_holder(gl); ++ try_futile = !may_grant(gl, first_gh, gh); ++ } + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + goto fail; + } + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && +- (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) ++ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) && ++ !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags))) + goto trap_recursive; + if (try_futile && + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { +@@ -1458,51 +1550,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh) + return test_bit(HIF_WAIT, &gh->gh_iflags) ? 
0 : 1; + } + +-/** +- * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) +- * @gh: the glock holder +- * +- */ ++static inline bool needs_demote(struct gfs2_glock *gl) ++{ ++ return (test_bit(GLF_DEMOTE, &gl->gl_flags) || ++ test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)); ++} + +-void gfs2_glock_dq(struct gfs2_holder *gh) ++static void __gfs2_glock_dq(struct gfs2_holder *gh) + { + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + unsigned delay = 0; + int fast_path = 0; + +- spin_lock(&gl->gl_lockref.lock); + /* +- * If we're in the process of file system withdraw, we cannot just +- * dequeue any glocks until our journal is recovered, lest we +- * introduce file system corruption. We need two exceptions to this +- * rule: We need to allow unlocking of nondisk glocks and the glock +- * for our own journal that needs recovery. ++ * This while loop is similar to function demote_incompat_holders: ++ * If the glock is due to be demoted (which may be from another node ++ * or even if this holder is GL_NOCACHE), the weak holders are ++ * demoted as well, allowing the glock to be demoted. + */ +- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && +- glock_blocked_by_withdraw(gl) && +- gh->gh_gl != sdp->sd_jinode_gl) { +- sdp->sd_glock_dqs_held++; +- spin_unlock(&gl->gl_lockref.lock); +- might_sleep(); +- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, +- TASK_UNINTERRUPTIBLE); +- spin_lock(&gl->gl_lockref.lock); +- } +- if (gh->gh_flags & GL_NOCACHE) +- handle_callback(gl, LM_ST_UNLOCKED, 0, false); ++ while (gh) { ++ /* ++ * If we're in the process of file system withdraw, we cannot ++ * just dequeue any glocks until our journal is recovered, lest ++ * we introduce file system corruption. We need two exceptions ++ * to this rule: We need to allow unlocking of nondisk glocks ++ * and the glock for our own journal that needs recovery. ++ */ ++ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && ++ glock_blocked_by_withdraw(gl) && ++ gh->gh_gl != sdp->sd_jinode_gl) { ++ sdp->sd_glock_dqs_held++; ++ spin_unlock(&gl->gl_lockref.lock); ++ might_sleep(); ++ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, ++ TASK_UNINTERRUPTIBLE); ++ spin_lock(&gl->gl_lockref.lock); ++ } + +- list_del_init(&gh->gh_list); +- clear_bit(HIF_HOLDER, &gh->gh_iflags); +- if (list_empty(&gl->gl_holders) && +- !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && +- !test_bit(GLF_DEMOTE, &gl->gl_flags)) +- fast_path = 1; ++ /* ++ * This holder should not be cached, so mark it for demote. ++ * Note: this should be done before the check for needs_demote ++ * below. ++ */ ++ if (gh->gh_flags & GL_NOCACHE) ++ handle_callback(gl, LM_ST_UNLOCKED, 0, false); ++ ++ list_del_init(&gh->gh_list); ++ clear_bit(HIF_HOLDER, &gh->gh_iflags); ++ trace_gfs2_glock_queue(gh, 0); ++ ++ /* ++ * If there hasn't been a demote request we are done. ++ * (Let the remaining holders, if any, keep holding it.) ++ */ ++ if (!needs_demote(gl)) { ++ if (list_empty(&gl->gl_holders)) ++ fast_path = 1; ++ break; ++ } ++ /* ++ * If we have another strong holder (we cannot auto-demote) ++ * we are done. It keeps holding it until it is done. ++ */ ++ if (find_first_strong_holder(gl)) ++ break; ++ ++ /* ++ * If we have a weak holder at the head of the list, it ++ * (and all others like it) must be auto-demoted. If there ++ * are no more weak holders, we exit the while loop. 
++ */ ++ gh = find_first_holder(gl); ++ } + + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) + gfs2_glock_add_to_lru(gl); + +- trace_gfs2_glock_queue(gh, 0); + if (unlikely(!fast_path)) { + gl->gl_lockref.count++; + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && +@@ -1511,6 +1635,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh) + delay = gl->gl_hold_time; + __gfs2_glock_queue_work(gl, delay); + } ++} ++ ++/** ++ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) ++ * @gh: the glock holder ++ * ++ */ ++void gfs2_glock_dq(struct gfs2_holder *gh) ++{ ++ struct gfs2_glock *gl = gh->gh_gl; ++ ++ spin_lock(&gl->gl_lockref.lock); ++ __gfs2_glock_dq(gh); + spin_unlock(&gl->gl_lockref.lock); + } + +@@ -1673,6 +1810,7 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) + + void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) + { ++ struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, }; + unsigned long delay = 0; + unsigned long holdtime; + unsigned long now = jiffies; +@@ -1687,6 +1825,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + delay = gl->gl_hold_time; + } ++ /* ++ * Note 1: We cannot call demote_incompat_holders from handle_callback ++ * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq -> ++ * handle_callback -> demote_incompat_holders -> gfs2_glock_dq ++ * Plus, we only want to demote the holders if the request comes from ++ * a remote cluster node because local holder conflicts are resolved ++ * elsewhere. ++ * ++ * Note 2: if a remote node wants this glock in EX mode, lock_dlm will ++ * request that we set our state to UNLOCKED. Here we mock up a holder ++ * to make it look like someone wants the lock EX locally. Any SH ++ * and DF requests should be able to share the lock without demoting. ++ * ++ * Note 3: We only want to demote the demoteable holders when there ++ * are no more strong holders. The demoteable holders might as well ++ * keep the glock until the last strong holder is done with it. 
++ */ ++ if (!find_first_strong_holder(gl)) { ++ if (state == LM_ST_UNLOCKED) ++ mock_gh.gh_state = LM_ST_EXCLUSIVE; ++ demote_incompat_holders(gl, &mock_gh); ++ } + handle_callback(gl, state, delay, true); + __gfs2_glock_queue_work(gl, delay); + spin_unlock(&gl->gl_lockref.lock); +@@ -2078,6 +2238,8 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) + *p++ = 'H'; + if (test_bit(HIF_WAIT, &iflags)) + *p++ = 'W'; ++ if (test_bit(HIF_MAY_DEMOTE, &iflags)) ++ *p++ = 'D'; + *p = 0; + return buf; + } +diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h +index 31a8f2f649b52..9012487da4c69 100644 +--- a/fs/gfs2/glock.h ++++ b/fs/gfs2/glock.h +@@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + break; ++ if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) ++ continue; + if (gh->gh_owner_pid == pid) + goto out; + } +@@ -325,6 +327,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) + spin_unlock(&gl->gl_lockref.lock); + } + ++static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh) ++{ ++ struct gfs2_glock *gl = gh->gh_gl; ++ ++ spin_lock(&gl->gl_lockref.lock); ++ set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); ++ spin_unlock(&gl->gl_lockref.lock); ++} ++ ++static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh) ++{ ++ struct gfs2_glock *gl = gh->gh_gl; ++ ++ spin_lock(&gl->gl_lockref.lock); ++ clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); ++ spin_unlock(&gl->gl_lockref.lock); ++} ++ + extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); + extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); + +diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h +index 0fe49770166ea..ca42d310fd4d6 100644 +--- a/fs/gfs2/incore.h ++++ b/fs/gfs2/incore.h +@@ -252,6 +252,7 @@ struct gfs2_lkstats { + + enum { + /* States */ ++ HIF_MAY_DEMOTE = 1, + HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ + HIF_WAIT = 10, + }; +@@ -386,9 +387,8 @@ struct gfs2_inode { + u64 i_generation; + u64 i_eattr; + unsigned long i_flags; /* GIF_... */ +- struct gfs2_glock *i_gl; /* Move into i_gh? */ ++ struct gfs2_glock *i_gl; + struct gfs2_holder i_iopen_gh; +- struct gfs2_holder i_gh; /* for prepare/commit_write only */ + struct gfs2_qadata *i_qadata; /* quota allocation data */ + struct gfs2_holder i_rgd_gh; + struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */ +diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c +index 97119ec3b8503..fe10d8a30f6bd 100644 +--- a/fs/iomap/buffered-io.c ++++ b/fs/iomap/buffered-io.c +@@ -757,7 +757,7 @@ again: + * same page as we're writing to, without it being marked + * up-to-date. 
+ */ +- if (unlikely(iov_iter_fault_in_readable(i, bytes))) { ++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) { + status = -EFAULT; + break; + } +diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c +index 4ecd255e0511c..468dcbba45bcb 100644 +--- a/fs/iomap/direct-io.c ++++ b/fs/iomap/direct-io.c +@@ -31,6 +31,7 @@ struct iomap_dio { + atomic_t ref; + unsigned flags; + int error; ++ size_t done_before; + bool wait_for_completion; + + union { +@@ -124,6 +125,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) + if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) + ret = generic_write_sync(iocb, ret); + ++ if (ret > 0) ++ ret += dio->done_before; ++ + kfree(dio); + + return ret; +@@ -371,6 +375,8 @@ static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, + loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); + + dio->size += length; ++ if (!length) ++ return -EFAULT; + return length; + } + +@@ -402,6 +408,8 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, + copied = copy_to_iter(inline_data, length, iter); + } + dio->size += copied; ++ if (!copied) ++ return -EFAULT; + return copied; + } + +@@ -446,13 +454,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter, + * may be pure data writes. In that case, we still need to do a full data sync + * completion. + * ++ * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL, ++ * __iomap_dio_rw can return a partial result if it encounters a non-resident ++ * page in @iter after preparing a transfer. In that case, the non-resident ++ * pages can be faulted in and the request resumed with @done_before set to the ++ * number of bytes previously transferred. The request will then complete with ++ * the correct total number of bytes transferred; this is essential for ++ * completing partial requests asynchronously. ++ * + * Returns -ENOTBLK In case of a page invalidation invalidation failure for + * writes. The callers needs to fall back to buffered I/O in this case. 
+ */ + struct iomap_dio * + __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, +- unsigned int dio_flags) ++ unsigned int dio_flags, size_t done_before) + { + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = file_inode(iocb->ki_filp); +@@ -482,6 +498,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + dio->dops = dops; + dio->error = 0; + dio->flags = 0; ++ dio->done_before = done_before; + + dio->submit.iter = iter; + dio->submit.waiter = current; +@@ -577,6 +594,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size) + iov_iter_revert(iter, iomi.pos - dio->i_size); + ++ if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) { ++ if (!(iocb->ki_flags & IOCB_NOWAIT)) ++ wait_for_completion = true; ++ ret = 0; ++ } ++ + /* magic error code to fall back to buffered I/O */ + if (ret == -ENOTBLK) { + wait_for_completion = true; +@@ -642,11 +665,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw); + ssize_t + iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, +- unsigned int dio_flags) ++ unsigned int dio_flags, size_t done_before) + { + struct iomap_dio *dio; + +- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags); ++ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before); + if (IS_ERR_OR_NULL(dio)) + return PTR_ERR_OR_ZERO(dio); + return iomap_dio_complete(dio); +diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c +index ab4f3362466d0..a43adeacd930c 100644 +--- a/fs/ntfs/file.c ++++ b/fs/ntfs/file.c +@@ -1829,7 +1829,7 @@ again: + * pages being swapped out between us bringing them into memory + * and doing the actual copying. 
+ */ +- if (unlikely(iov_iter_fault_in_readable(i, bytes))) { ++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) { + status = -EFAULT; + break; + } +diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c +index 43b1451bff539..54b9599640ef4 100644 +--- a/fs/ntfs3/file.c ++++ b/fs/ntfs3/file.c +@@ -989,7 +989,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + frame_vbo = pos & ~(frame_size - 1); + index = frame_vbo >> PAGE_SHIFT; + +- if (unlikely(iov_iter_fault_in_readable(from, bytes))) { ++ if (unlikely(fault_in_iov_iter_readable(from, bytes))) { + err = -EFAULT; + goto out; + } +diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c +index 7aa943edfc02f..240eb932c014b 100644 +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -259,7 +259,7 @@ xfs_file_dio_read( + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); + if (ret) + return ret; +- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0); ++ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +@@ -569,7 +569,7 @@ xfs_file_dio_write_aligned( + } + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, +- &xfs_dio_write_ops, 0); ++ &xfs_dio_write_ops, 0, 0); + out_unlock: + if (iolock) + xfs_iunlock(ip, iolock); +@@ -647,7 +647,7 @@ retry_exclusive: + + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, +- &xfs_dio_write_ops, flags); ++ &xfs_dio_write_ops, flags, 0); + + /* + * Retry unaligned I/O with exclusive blocking semantics if the DIO +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c +index 807f33553a8eb..bced33b76beac 100644 +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + ret = zonefs_file_dio_append(iocb, from); + else + ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, +- &zonefs_write_dio_ops, 0); ++ &zonefs_write_dio_ops, 0, 0); + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && + (ret > 0 || ret == -EIOCBQUEUED)) { + if (ret > 0) +@@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + } + file_accessed(iocb->ki_filp); + ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, +- &zonefs_read_dio_ops, 0); ++ &zonefs_read_dio_ops, 0, 0); + } else { + ret = generic_file_read_iter(iocb, to); + if (ret == -EIO) +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index 15b690a0cecb0..c5c4b6f09e230 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -293,6 +293,34 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, + + extern const struct bpf_map_ops bpf_map_offload_ops; + ++/* bpf_type_flag contains a set of flags that are applicable to the values of ++ * arg_type, ret_type and reg_type. For example, a pointer value may be null, ++ * or a memory is read-only. We classify types into two categories: base types ++ * and extended types. Extended types are base types combined with a type flag. ++ * ++ * Currently there are no more than 32 base types in arg_type, ret_type and ++ * reg_types. ++ */ ++#define BPF_BASE_TYPE_BITS 8 ++ ++enum bpf_type_flag { ++ /* PTR may be NULL. */ ++ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), ++ ++ /* MEM is read-only. When applied on bpf_arg, it indicates the arg is ++ * compatible with both mutable and immutable memory. ++ */ ++ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), ++ ++ __BPF_TYPE_LAST_FLAG = MEM_RDONLY, ++}; ++ ++/* Max number of base types. 
*/ ++#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) ++ ++/* Max number of all types. */ ++#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1)) ++ + /* function argument constraints */ + enum bpf_arg_type { + ARG_DONTCARE = 0, /* unused argument in helper function */ +@@ -304,13 +332,11 @@ enum bpf_arg_type { + ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ + ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ + ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ +- ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ + + /* the following constraints used to prototype bpf_memcmp() and other + * functions that access data on eBPF program stack + */ + ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ +- ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ + ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, + * helper function must fill all bytes or clear + * them in error case. +@@ -320,42 +346,65 @@ enum bpf_arg_type { + ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ + + ARG_PTR_TO_CTX, /* pointer to context */ +- ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ + ARG_ANYTHING, /* any (initialized) argument is ok */ + ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ + ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ + ARG_PTR_TO_INT, /* pointer to int */ + ARG_PTR_TO_LONG, /* pointer to long */ + ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ +- ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ + ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ + ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ +- ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ + ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ + ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ + ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ + ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ +- ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ ++ ARG_PTR_TO_STACK, /* pointer to stack */ + ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ + ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ + __BPF_ARG_TYPE_MAX, ++ ++ /* Extended arg_types. */ ++ ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, ++ ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, ++ ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, ++ ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ++ ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ++ ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ++ ++ /* This must be the last entry. Its purpose is to ensure the enum is ++ * wide enough to hold the higher bits reserved for bpf_type_flag. 
++ */ ++ __BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT, + }; ++static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); + + /* type of values returned from helper functions */ + enum bpf_return_type { + RET_INTEGER, /* function returns integer */ + RET_VOID, /* function doesn't return anything */ + RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ +- RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ +- RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ +- RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ +- RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ +- RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ +- RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ +- RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ ++ RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ ++ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ ++ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ ++ RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ + RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ ++ __BPF_RET_TYPE_MAX, ++ ++ /* Extended ret_types. */ ++ RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, ++ RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, ++ RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, ++ RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, ++ RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, ++ RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, ++ ++ /* This must be the last entry. Its purpose is to ensure the enum is ++ * wide enough to hold the higher bits reserved for bpf_type_flag. ++ */ ++ __BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT, + }; ++static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); + + /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL +@@ -417,18 +466,15 @@ enum bpf_reg_type { + PTR_TO_CTX, /* reg points to bpf_context */ + CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ + PTR_TO_MAP_VALUE, /* reg points to map element value */ +- PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ ++ PTR_TO_MAP_KEY, /* reg points to a map element key */ + PTR_TO_STACK, /* reg == frame_pointer + offset */ + PTR_TO_PACKET_META, /* skb->data - meta_len */ + PTR_TO_PACKET, /* reg points to skb->data */ + PTR_TO_PACKET_END, /* skb->data + headlen */ + PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ + PTR_TO_SOCKET, /* reg points to struct bpf_sock */ +- PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ + PTR_TO_SOCK_COMMON, /* reg points to sock_common */ +- PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ + PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ +- PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ + PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ + PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ + /* PTR_TO_BTF_ID points to a kernel struct that does not need +@@ -446,18 +492,25 @@ enum bpf_reg_type { + * been checked for null. 
Used primarily to inform the verifier + * an explicit null check is required for this struct. + */ +- PTR_TO_BTF_ID_OR_NULL, + PTR_TO_MEM, /* reg points to valid memory region */ +- PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ +- PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ +- PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ +- PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ +- PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ ++ PTR_TO_BUF, /* reg points to a read/write buffer */ + PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + PTR_TO_FUNC, /* reg points to a bpf program function */ +- PTR_TO_MAP_KEY, /* reg points to a map element key */ + __BPF_REG_TYPE_MAX, ++ ++ /* Extended reg_types. */ ++ PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, ++ PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, ++ PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, ++ PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, ++ PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, ++ ++ /* This must be the last entry. Its purpose is to ensure the enum is ++ * wide enough to hold the higher bits reserved for bpf_type_flag. ++ */ ++ __BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT, + }; ++static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); + + /* The information passed from prog-specific *_is_valid_access + * back to the verifier. +diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h +index 364550dd19c4a..bb1cc3fbc4bab 100644 +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -18,6 +18,8 @@ + * that converting umax_value to int cannot overflow. + */ + #define BPF_MAX_VAR_SIZ (1 << 29) ++/* size of type_str_buf in bpf_verifier. */ ++#define TYPE_STR_BUF_LEN 64 + + /* Liveness marks, used for registers and spilled-regs (in stack slots). + * Read marks propagate upwards until they find a write mark; they record that +@@ -474,6 +476,8 @@ struct bpf_verifier_env { + /* longest register parentage chain walked for liveness marking */ + u32 longest_mark_read_walk; + bpfptr_t fd_array; ++ /* buffer used in reg_type_str() to generate reg_type string */ ++ char type_str_buf[TYPE_STR_BUF_LEN]; + }; + + __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, +@@ -535,4 +539,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, + u32 btf_id, + struct bpf_attach_target_info *tgt_info); + ++#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) ++ ++/* extract base type from bpf_{arg, return, reg}_type. */ ++static inline u32 base_type(u32 type) ++{ ++ return type & BPF_BASE_TYPE_MASK; ++} ++ ++/* extract flags from an extended type. See bpf_type_flag in bpf.h. */ ++static inline u32 type_flag(u32 type) ++{ ++ return type & ~BPF_BASE_TYPE_MASK; ++} ++ + #endif /* _LINUX_BPF_VERIFIER_H */ +diff --git a/include/linux/iomap.h b/include/linux/iomap.h +index 24f8489583ca7..829f2325ecbab 100644 +--- a/include/linux/iomap.h ++++ b/include/linux/iomap.h +@@ -330,12 +330,19 @@ struct iomap_dio_ops { + */ + #define IOMAP_DIO_OVERWRITE_ONLY (1 << 1) + ++/* ++ * When a page fault occurs, return a partial synchronous result and allow ++ * the caller to retry the rest of the operation after dealing with the page ++ * fault. 
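Everything above boils down to simple bit arithmetic: the low BPF_BASE_TYPE_BITS bits carry the base type and the bits above them carry the flags, so base_type() and type_flag() are just masks. Below is a standalone, compilable illustration of the split; the numeric value used for PTR_TO_MEM is arbitrary and only stands in for the real enum value.

#include <stdio.h>

#define BPF_BASE_TYPE_BITS	8
#define BPF_BASE_TYPE_MASK	((1u << BPF_BASE_TYPE_BITS) - 1)

#define PTR_MAYBE_NULL		(1u << (BPF_BASE_TYPE_BITS + 0))
#define MEM_RDONLY		(1u << (BPF_BASE_TYPE_BITS + 1))

#define PTR_TO_MEM		20u	/* illustrative base-type value only */

static unsigned int base_type(unsigned int type) { return type & BPF_BASE_TYPE_MASK; }
static unsigned int type_flag(unsigned int type) { return type & ~BPF_BASE_TYPE_MASK; }

int main(void)
{
	unsigned int t = PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY;

	printf("base=%u maybe_null=%d rdonly=%d\n",
	       base_type(t), !!(t & PTR_MAYBE_NULL), !!(t & MEM_RDONLY));

	t &= ~PTR_MAYBE_NULL;		/* what a successful NULL check now does */
	printf("after null check: base=%u flags=0x%x\n", base_type(t), type_flag(t));
	return 0;
}

Clearing PTR_MAYBE_NULL in place is the same operation the reworked mark_ptr_not_null_reg() performs for non-map registers later in this patch.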
++ */ ++#define IOMAP_DIO_PARTIAL (1 << 2) ++ + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, +- unsigned int dio_flags); ++ unsigned int dio_flags, size_t done_before); + struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, +- unsigned int dio_flags); ++ unsigned int dio_flags, size_t done_before); + ssize_t iomap_dio_complete(struct iomap_dio *dio); + int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 90c2d7f3c7a88..04345ff97f8ca 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2858,7 +2858,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ + #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO + * and return without waiting upon it */ +-#define FOLL_POPULATE 0x40 /* fault in page */ ++#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */ ++#define FOLL_NOFAULT 0x80 /* do not fault in pages */ + #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ + #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ + #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 62db6b0176b95..2f7dd14083d94 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -733,61 +733,11 @@ int wait_on_page_private_2_killable(struct page *page); + extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); + + /* +- * Fault everything in given userspace address range in. ++ * Fault in userspace address range. + */ +-static inline int fault_in_pages_writeable(char __user *uaddr, size_t size) +-{ +- char __user *end = uaddr + size - 1; +- +- if (unlikely(size == 0)) +- return 0; +- +- if (unlikely(uaddr > end)) +- return -EFAULT; +- /* +- * Writing zeroes into userspace here is OK, because we know that if +- * the zero gets there, we'll be overwriting it. +- */ +- do { +- if (unlikely(__put_user(0, uaddr) != 0)) +- return -EFAULT; +- uaddr += PAGE_SIZE; +- } while (uaddr <= end); +- +- /* Check whether the range spilled into the next page. */ +- if (((unsigned long)uaddr & PAGE_MASK) == +- ((unsigned long)end & PAGE_MASK)) +- return __put_user(0, end); +- +- return 0; +-} +- +-static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) +-{ +- volatile char c; +- const char __user *end = uaddr + size - 1; +- +- if (unlikely(size == 0)) +- return 0; +- +- if (unlikely(uaddr > end)) +- return -EFAULT; +- +- do { +- if (unlikely(__get_user(c, uaddr) != 0)) +- return -EFAULT; +- uaddr += PAGE_SIZE; +- } while (uaddr <= end); +- +- /* Check whether the range spilled into the next page. 
*/ +- if (((unsigned long)uaddr & PAGE_MASK) == +- ((unsigned long)end & PAGE_MASK)) { +- return __get_user(c, end); +- } +- +- (void)c; +- return 0; +-} ++size_t fault_in_writeable(char __user *uaddr, size_t size); ++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size); ++size_t fault_in_readable(const char __user *uaddr, size_t size); + + int add_to_page_cache_locked(struct page *page, struct address_space *mapping, + pgoff_t index, gfp_t gfp_mask); +diff --git a/include/linux/uio.h b/include/linux/uio.h +index 207101a9c5c32..6350354f97e90 100644 +--- a/include/linux/uio.h ++++ b/include/linux/uio.h +@@ -35,6 +35,7 @@ struct iov_iter_state { + + struct iov_iter { + u8 iter_type; ++ bool nofault; + bool data_source; + size_t iov_offset; + size_t count; +@@ -133,7 +134,8 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, + size_t bytes, struct iov_iter *i); + void iov_iter_advance(struct iov_iter *i, size_t bytes); + void iov_iter_revert(struct iov_iter *i, size_t bytes); +-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes); ++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); ++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); + size_t iov_iter_single_seg_count(const struct iov_iter *i); + size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i); +diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c +index 09406b0e215e1..40df35088cdbd 100644 +--- a/kernel/bpf/btf.c ++++ b/kernel/bpf/btf.c +@@ -4800,10 +4800,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; ++ u32 type, flag; + +- if (ctx_arg_info->offset == off && +- (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || +- ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { ++ type = base_type(ctx_arg_info->reg_type); ++ flag = type_flag(ctx_arg_info->reg_type); ++ if (ctx_arg_info->offset == off && type == PTR_TO_BUF && ++ (flag & PTR_MAYBE_NULL)) { + info->reg_type = ctx_arg_info->reg_type; + return true; + } +@@ -5508,9 +5510,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, + if (reg->type == PTR_TO_BTF_ID) { + reg_btf = reg->btf; + reg_ref_id = reg->btf_id; +- } else if (reg2btf_ids[reg->type]) { ++ } else if (reg2btf_ids[base_type(reg->type)]) { + reg_btf = btf_vmlinux; +- reg_ref_id = *reg2btf_ids[reg->type]; ++ reg_ref_id = *reg2btf_ids[base_type(reg->type)]; + } else { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n", + func_name, i, +@@ -5717,7 +5719,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, + return -EINVAL; + } + +- reg->type = PTR_TO_MEM_OR_NULL; ++ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; + reg->id = ++env->id_gen; + + continue; +@@ -6229,7 +6231,7 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { + .func = bpf_btf_find_by_name_kind, + .gpl_only = false, + .ret_type = RET_INTEGER, +- .arg1_type = ARG_PTR_TO_MEM, ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c +index 7dbd68195a2b0..fe053ffd89329 100644 +--- a/kernel/bpf/cgroup.c ++++ b/kernel/bpf/cgroup.c +@@ -1753,7 +1753,7 @@ static const struct 
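The replacement helpers declared above switch to the copy_from_user() convention: the return value is the number of bytes that could not be faulted in, so 0 means complete success and the full size means nothing in the range was usable. A small compilable model of that convention, with a fake residency limit standing in for what the MMU would actually allow:

#include <stdio.h>

/*
 * Stand-in for fault_in_readable()/fault_in_writeable(): returns how many
 * of @size bytes could NOT be made accessible, where @resident models the
 * amount the hardware would actually let us touch.
 */
static size_t fake_fault_in(size_t size, size_t resident)
{
	size_t ok = size < resident ? size : resident;

	return size - ok;
}

int main(void)
{
	size_t size = 4096;

	printf("fully resident : %zu bytes left\n", fake_fault_in(size, 8192)); /* 0    */
	printf("half resident  : %zu bytes left\n", fake_fault_in(size, 2048)); /* 2048 */
	printf("none resident  : %zu bytes left\n", fake_fault_in(size, 0));    /* 4096 */
	return 0;
}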
bpf_func_proto bpf_sysctl_set_new_value_proto = { + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + }; + +diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c +index 6f600cc95ccda..a711ffe238932 100644 +--- a/kernel/bpf/helpers.c ++++ b/kernel/bpf/helpers.c +@@ -530,7 +530,7 @@ const struct bpf_func_proto bpf_strtol_proto = { + .func = bpf_strtol, + .gpl_only = false, + .ret_type = RET_INTEGER, +- .arg1_type = ARG_PTR_TO_MEM, ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +@@ -558,7 +558,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { + .func = bpf_strtoul, + .gpl_only = false, + .ret_type = RET_INTEGER, +- .arg1_type = ARG_PTR_TO_MEM, ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +@@ -630,7 +630,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = { + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + }; + +@@ -667,7 +667,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) + const struct bpf_func_proto bpf_per_cpu_ptr_proto = { + .func = bpf_per_cpu_ptr, + .gpl_only = false, +- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, ++ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, + .arg2_type = ARG_ANYTHING, + }; +@@ -680,7 +680,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) + const struct bpf_func_proto bpf_this_cpu_ptr_proto = { + .func = bpf_this_cpu_ptr, + .gpl_only = false, +- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, ++ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, + }; + +@@ -1013,7 +1013,7 @@ const struct bpf_func_proto bpf_snprintf_proto = { + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_CONST_STR, +- .arg4_type = ARG_PTR_TO_MEM_OR_NULL, ++ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + }; + +diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c +index 6a9542af4212a..b0fa190b09790 100644 +--- a/kernel/bpf/map_iter.c ++++ b/kernel/bpf/map_iter.c +@@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map_elem, key), +- PTR_TO_RDONLY_BUF_OR_NULL }, ++ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, + { offsetof(struct bpf_iter__bpf_map_elem, value), +- PTR_TO_RDWR_BUF_OR_NULL }, ++ PTR_TO_BUF | PTR_MAYBE_NULL }, + }, + }; + +diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c +index f1c51c45667d3..710ba9de12ce4 100644 +--- a/kernel/bpf/ringbuf.c ++++ b/kernel/bpf/ringbuf.c +@@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = { + .func = bpf_ringbuf_output, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + }; +diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c +index 42490c39dfbf5..48e02a725563f 100644 +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -4753,7 +4753,7 
@@ static const struct bpf_func_proto bpf_sys_bpf_proto = { + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + }; + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 670721e39c0e8..d2b119b4fbe74 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -445,18 +445,6 @@ static bool reg_type_not_null(enum bpf_reg_type type) + type == PTR_TO_SOCK_COMMON; + } + +-static bool reg_type_may_be_null(enum bpf_reg_type type) +-{ +- return type == PTR_TO_MAP_VALUE_OR_NULL || +- type == PTR_TO_SOCKET_OR_NULL || +- type == PTR_TO_SOCK_COMMON_OR_NULL || +- type == PTR_TO_TCP_SOCK_OR_NULL || +- type == PTR_TO_BTF_ID_OR_NULL || +- type == PTR_TO_MEM_OR_NULL || +- type == PTR_TO_RDONLY_BUF_OR_NULL || +- type == PTR_TO_RDWR_BUF_OR_NULL; +-} +- + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) + { + return reg->type == PTR_TO_MAP_VALUE && +@@ -465,12 +453,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) + + static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) + { +- return type == PTR_TO_SOCKET || +- type == PTR_TO_SOCKET_OR_NULL || +- type == PTR_TO_TCP_SOCK || +- type == PTR_TO_TCP_SOCK_OR_NULL || +- type == PTR_TO_MEM || +- type == PTR_TO_MEM_OR_NULL; ++ return base_type(type) == PTR_TO_SOCKET || ++ base_type(type) == PTR_TO_TCP_SOCK || ++ base_type(type) == PTR_TO_MEM; ++} ++ ++static bool type_is_rdonly_mem(u32 type) ++{ ++ return type & MEM_RDONLY; + } + + static bool arg_type_may_be_refcounted(enum bpf_arg_type type) +@@ -478,14 +468,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) + return type == ARG_PTR_TO_SOCK_COMMON; + } + +-static bool arg_type_may_be_null(enum bpf_arg_type type) ++static bool type_may_be_null(u32 type) + { +- return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || +- type == ARG_PTR_TO_MEM_OR_NULL || +- type == ARG_PTR_TO_CTX_OR_NULL || +- type == ARG_PTR_TO_SOCKET_OR_NULL || +- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || +- type == ARG_PTR_TO_STACK_OR_NULL; ++ return type & PTR_MAYBE_NULL; + } + + /* Determine whether the function releases some resources allocated by another +@@ -545,39 +530,54 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) + insn->imm == BPF_CMPXCHG; + } + +-/* string representation of 'enum bpf_reg_type' */ +-static const char * const reg_type_str[] = { +- [NOT_INIT] = "?", +- [SCALAR_VALUE] = "inv", +- [PTR_TO_CTX] = "ctx", +- [CONST_PTR_TO_MAP] = "map_ptr", +- [PTR_TO_MAP_VALUE] = "map_value", +- [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", +- [PTR_TO_STACK] = "fp", +- [PTR_TO_PACKET] = "pkt", +- [PTR_TO_PACKET_META] = "pkt_meta", +- [PTR_TO_PACKET_END] = "pkt_end", +- [PTR_TO_FLOW_KEYS] = "flow_keys", +- [PTR_TO_SOCKET] = "sock", +- [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", +- [PTR_TO_SOCK_COMMON] = "sock_common", +- [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", +- [PTR_TO_TCP_SOCK] = "tcp_sock", +- [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", +- [PTR_TO_TP_BUFFER] = "tp_buffer", +- [PTR_TO_XDP_SOCK] = "xdp_sock", +- [PTR_TO_BTF_ID] = "ptr_", +- [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", +- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", +- [PTR_TO_MEM] = "mem", +- [PTR_TO_MEM_OR_NULL] = "mem_or_null", +- [PTR_TO_RDONLY_BUF] = "rdonly_buf", +- [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", +- [PTR_TO_RDWR_BUF] = "rdwr_buf", +- [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", +- [PTR_TO_FUNC] = 
"func", +- [PTR_TO_MAP_KEY] = "map_key", +-}; ++/* string representation of 'enum bpf_reg_type' ++ * ++ * Note that reg_type_str() can not appear more than once in a single verbose() ++ * statement. ++ */ ++static const char *reg_type_str(struct bpf_verifier_env *env, ++ enum bpf_reg_type type) ++{ ++ char postfix[16] = {0}, prefix[16] = {0}; ++ static const char * const str[] = { ++ [NOT_INIT] = "?", ++ [SCALAR_VALUE] = "inv", ++ [PTR_TO_CTX] = "ctx", ++ [CONST_PTR_TO_MAP] = "map_ptr", ++ [PTR_TO_MAP_VALUE] = "map_value", ++ [PTR_TO_STACK] = "fp", ++ [PTR_TO_PACKET] = "pkt", ++ [PTR_TO_PACKET_META] = "pkt_meta", ++ [PTR_TO_PACKET_END] = "pkt_end", ++ [PTR_TO_FLOW_KEYS] = "flow_keys", ++ [PTR_TO_SOCKET] = "sock", ++ [PTR_TO_SOCK_COMMON] = "sock_common", ++ [PTR_TO_TCP_SOCK] = "tcp_sock", ++ [PTR_TO_TP_BUFFER] = "tp_buffer", ++ [PTR_TO_XDP_SOCK] = "xdp_sock", ++ [PTR_TO_BTF_ID] = "ptr_", ++ [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", ++ [PTR_TO_MEM] = "mem", ++ [PTR_TO_BUF] = "buf", ++ [PTR_TO_FUNC] = "func", ++ [PTR_TO_MAP_KEY] = "map_key", ++ }; ++ ++ if (type & PTR_MAYBE_NULL) { ++ if (base_type(type) == PTR_TO_BTF_ID || ++ base_type(type) == PTR_TO_PERCPU_BTF_ID) ++ strncpy(postfix, "or_null_", 16); ++ else ++ strncpy(postfix, "_or_null", 16); ++ } ++ ++ if (type & MEM_RDONLY) ++ strncpy(prefix, "rdonly_", 16); ++ ++ snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", ++ prefix, str[base_type(type)], postfix); ++ return env->type_str_buf; ++} + + static char slot_type_char[] = { + [STACK_INVALID] = '?', +@@ -628,7 +628,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, + continue; + verbose(env, " R%d", i); + print_liveness(env, reg->live); +- verbose(env, "=%s", reg_type_str[t]); ++ verbose(env, "=%s", reg_type_str(env, t)); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && +@@ -636,9 +636,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, + /* reg->off should be 0 for SCALAR_VALUE */ + verbose(env, "%lld", reg->var_off.value + reg->off); + } else { +- if (t == PTR_TO_BTF_ID || +- t == PTR_TO_BTF_ID_OR_NULL || +- t == PTR_TO_PERCPU_BTF_ID) ++ if (base_type(t) == PTR_TO_BTF_ID || ++ base_type(t) == PTR_TO_PERCPU_BTF_ID) + verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); + verbose(env, "(id=%d", reg->id); + if (reg_type_may_be_refcounted_or_null(t)) +@@ -647,10 +646,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, + verbose(env, ",off=%d", reg->off); + if (type_is_pkt_pointer(t)) + verbose(env, ",r=%d", reg->range); +- else if (t == CONST_PTR_TO_MAP || +- t == PTR_TO_MAP_KEY || +- t == PTR_TO_MAP_VALUE || +- t == PTR_TO_MAP_VALUE_OR_NULL) ++ else if (base_type(t) == CONST_PTR_TO_MAP || ++ base_type(t) == PTR_TO_MAP_KEY || ++ base_type(t) == PTR_TO_MAP_VALUE) + verbose(env, ",ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); +@@ -720,7 +718,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, + if (state->stack[i].slot_type[0] == STACK_SPILL) { + reg = &state->stack[i].spilled_ptr; + t = reg->type; +- verbose(env, "=%s", reg_type_str[t]); ++ verbose(env, "=%s", reg_type_str(env, t)); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) +@@ -1133,8 +1131,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, + + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) + { +- switch (reg->type) { +- case PTR_TO_MAP_VALUE_OR_NULL: { ++ if 
(base_type(reg->type) == PTR_TO_MAP_VALUE) { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { +@@ -1153,32 +1150,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) + } else { + reg->type = PTR_TO_MAP_VALUE; + } +- break; +- } +- case PTR_TO_SOCKET_OR_NULL: +- reg->type = PTR_TO_SOCKET; +- break; +- case PTR_TO_SOCK_COMMON_OR_NULL: +- reg->type = PTR_TO_SOCK_COMMON; +- break; +- case PTR_TO_TCP_SOCK_OR_NULL: +- reg->type = PTR_TO_TCP_SOCK; +- break; +- case PTR_TO_BTF_ID_OR_NULL: +- reg->type = PTR_TO_BTF_ID; +- break; +- case PTR_TO_MEM_OR_NULL: +- reg->type = PTR_TO_MEM; +- break; +- case PTR_TO_RDONLY_BUF_OR_NULL: +- reg->type = PTR_TO_RDONLY_BUF; +- break; +- case PTR_TO_RDWR_BUF_OR_NULL: +- reg->type = PTR_TO_RDWR_BUF; +- break; +- default: +- WARN_ONCE(1, "unknown nullable register type"); ++ return; + } ++ ++ reg->type &= ~PTR_MAYBE_NULL; + } + + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +@@ -1906,7 +1881,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, + break; + if (parent->live & REG_LIVE_DONE) { + verbose(env, "verifier BUG type %s var_off %lld off %d\n", +- reg_type_str[parent->type], ++ reg_type_str(env, parent->type), + parent->var_off.value, parent->off); + return -EFAULT; + } +@@ -2564,9 +2539,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) + + static bool is_spillable_regtype(enum bpf_reg_type type) + { +- switch (type) { ++ switch (base_type(type)) { + case PTR_TO_MAP_VALUE: +- case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_STACK: + case PTR_TO_CTX: + case PTR_TO_PACKET: +@@ -2575,21 +2549,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type) + case PTR_TO_FLOW_KEYS: + case CONST_PTR_TO_MAP: + case PTR_TO_SOCKET: +- case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: +- case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: +- case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: + case PTR_TO_BTF_ID: +- case PTR_TO_BTF_ID_OR_NULL: +- case PTR_TO_RDONLY_BUF: +- case PTR_TO_RDONLY_BUF_OR_NULL: +- case PTR_TO_RDWR_BUF: +- case PTR_TO_RDWR_BUF_OR_NULL: ++ case PTR_TO_BUF: + case PTR_TO_PERCPU_BTF_ID: + case PTR_TO_MEM: +- case PTR_TO_MEM_OR_NULL: + case PTR_TO_FUNC: + case PTR_TO_MAP_KEY: + return true; +@@ -3405,7 +3371,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, + */ + *reg_type = info.reg_type; + +- if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) { ++ if (base_type(*reg_type) == PTR_TO_BTF_ID) { + *btf = info.btf; + *btf_id = info.btf_id; + } else { +@@ -3473,7 +3439,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, + } + + verbose(env, "R%d invalid %s access off=%d size=%d\n", +- regno, reg_type_str[reg->type], off, size); ++ regno, reg_type_str(env, reg->type), off, size); + + return -EACCES; + } +@@ -4200,15 +4166,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn + mark_reg_unknown(env, regs, value_regno); + } + } +- } else if (reg->type == PTR_TO_MEM) { ++ } else if (base_type(reg->type) == PTR_TO_MEM) { ++ bool rdonly_mem = type_is_rdonly_mem(reg->type); ++ ++ if (type_may_be_null(reg->type)) { ++ verbose(env, "R%d invalid mem access '%s'\n", regno, ++ reg_type_str(env, reg->type)); ++ return -EACCES; ++ } ++ ++ if (t == BPF_WRITE && rdonly_mem) { ++ verbose(env, "R%d cannot write into %s\n", ++ regno, reg_type_str(env, reg->type)); ++ return -EACCES; ++ } ++ + if (t == BPF_WRITE && value_regno >= 0 && + 
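The reg_type_str() helper added above replaces the flat reg_type_str[] table: it assembles an optional rdonly_ prefix, the base-type name, and an _or_null suffix into a scratch buffer held in the verifier env, which is why it must not appear twice in one verbose() call. A compilable sketch of the same assembly outside the kernel; the flag values, the name table and the buffer handling are illustrative, and the kernel version additionally special-cases the btf_id names.

#include <stdio.h>

#define PTR_MAYBE_NULL	0x100u
#define MEM_RDONLY	0x200u
#define BASE_TYPE_MASK	0xffu

static const char * const base_name[] = { "mem", "buf", "map_value" };

static const char *toy_reg_type_str(char *buf, size_t len, unsigned int type)
{
	const char *prefix = (type & MEM_RDONLY) ? "rdonly_" : "";
	const char *suffix = (type & PTR_MAYBE_NULL) ? "_or_null" : "";

	/* One flat name per call, e.g. "rdonly_buf_or_null". */
	snprintf(buf, len, "%s%s%s", prefix, base_name[type & BASE_TYPE_MASK], suffix);
	return buf;
}

int main(void)
{
	char buf[64];

	printf("%s\n", toy_reg_type_str(buf, sizeof(buf), 1u));			   /* buf */
	printf("%s\n", toy_reg_type_str(buf, sizeof(buf),
					1u | MEM_RDONLY | PTR_MAYBE_NULL));	   /* rdonly_buf_or_null */
	return 0;
}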
is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into mem\n", value_regno); + return -EACCES; + } ++ + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); +- if (!err && t == BPF_READ && value_regno >= 0) ++ if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_CTX) { + enum bpf_reg_type reg_type = SCALAR_VALUE; +@@ -4238,7 +4219,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn + } else { + mark_reg_known_zero(env, regs, + value_regno); +- if (reg_type_may_be_null(reg_type)) ++ if (type_may_be_null(reg_type)) + regs[value_regno].id = ++env->id_gen; + /* A load of ctx field could have different + * actual load size with the one encoded in the +@@ -4246,8 +4227,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn + * a sub-register. + */ + regs[value_regno].subreg_def = DEF_NOT_SUBREG; +- if (reg_type == PTR_TO_BTF_ID || +- reg_type == PTR_TO_BTF_ID_OR_NULL) { ++ if (base_type(reg_type) == PTR_TO_BTF_ID) { + regs[value_regno].btf = btf; + regs[value_regno].btf_id = btf_id; + } +@@ -4300,7 +4280,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn + } else if (type_is_sk_pointer(reg->type)) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", +- regno, reg_type_str[reg->type]); ++ regno, reg_type_str(env, reg->type)); + return -EACCES; + } + err = check_sock_access(env, insn_idx, regno, off, size, t); +@@ -4316,26 +4296,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn + } else if (reg->type == CONST_PTR_TO_MAP) { + err = check_ptr_to_map_access(env, regs, regno, off, size, t, + value_regno); +- } else if (reg->type == PTR_TO_RDONLY_BUF) { +- if (t == BPF_WRITE) { +- verbose(env, "R%d cannot write into %s\n", +- regno, reg_type_str[reg->type]); +- return -EACCES; ++ } else if (base_type(reg->type) == PTR_TO_BUF) { ++ bool rdonly_mem = type_is_rdonly_mem(reg->type); ++ const char *buf_info; ++ u32 *max_access; ++ ++ if (rdonly_mem) { ++ if (t == BPF_WRITE) { ++ verbose(env, "R%d cannot write into %s\n", ++ regno, reg_type_str(env, reg->type)); ++ return -EACCES; ++ } ++ buf_info = "rdonly"; ++ max_access = &env->prog->aux->max_rdonly_access; ++ } else { ++ buf_info = "rdwr"; ++ max_access = &env->prog->aux->max_rdwr_access; + } ++ + err = check_buffer_access(env, reg, regno, off, size, false, +- "rdonly", +- &env->prog->aux->max_rdonly_access); +- if (!err && value_regno >= 0) +- mark_reg_unknown(env, regs, value_regno); +- } else if (reg->type == PTR_TO_RDWR_BUF) { +- err = check_buffer_access(env, reg, regno, off, size, false, +- "rdwr", +- &env->prog->aux->max_rdwr_access); +- if (!err && t == BPF_READ && value_regno >= 0) ++ buf_info, max_access); ++ ++ if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) + mark_reg_unknown(env, regs, value_regno); + } else { + verbose(env, "R%d invalid mem access '%s'\n", regno, +- reg_type_str[reg->type]); ++ reg_type_str(env, reg->type)); + return -EACCES; + } + +@@ -4409,7 +4395,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i + is_sk_reg(env, insn->dst_reg)) { + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", + insn->dst_reg, +- reg_type_str[reg_state(env, insn->dst_reg)->type]); ++ reg_type_str(env, reg_state(env, insn->dst_reg)->type)); + return -EACCES; + } + +@@ -4592,8 +4578,10 @@ static int 
check_helper_mem_access(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) + { + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; ++ const char *buf_info; ++ u32 *max_access; + +- switch (reg->type) { ++ switch (base_type(reg->type)) { + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + return check_packet_access(env, regno, reg->off, access_size, +@@ -4612,18 +4600,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, + return check_mem_region_access(env, regno, reg->off, + access_size, reg->mem_size, + zero_size_allowed); +- case PTR_TO_RDONLY_BUF: +- if (meta && meta->raw_mode) +- return -EACCES; +- return check_buffer_access(env, reg, regno, reg->off, +- access_size, zero_size_allowed, +- "rdonly", +- &env->prog->aux->max_rdonly_access); +- case PTR_TO_RDWR_BUF: ++ case PTR_TO_BUF: ++ if (type_is_rdonly_mem(reg->type)) { ++ if (meta && meta->raw_mode) ++ return -EACCES; ++ ++ buf_info = "rdonly"; ++ max_access = &env->prog->aux->max_rdonly_access; ++ } else { ++ buf_info = "rdwr"; ++ max_access = &env->prog->aux->max_rdwr_access; ++ } + return check_buffer_access(env, reg, regno, reg->off, + access_size, zero_size_allowed, +- "rdwr", +- &env->prog->aux->max_rdwr_access); ++ buf_info, max_access); + case PTR_TO_STACK: + return check_stack_range_initialized( + env, +@@ -4635,9 +4625,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, + register_is_null(reg)) + return 0; + +- verbose(env, "R%d type=%s expected=%s\n", regno, +- reg_type_str[reg->type], +- reg_type_str[PTR_TO_STACK]); ++ verbose(env, "R%d type=%s ", regno, ++ reg_type_str(env, reg->type)); ++ verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); + return -EACCES; + } + } +@@ -4648,7 +4638,7 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + if (register_is_null(reg)) + return 0; + +- if (reg_type_may_be_null(reg->type)) { ++ if (type_may_be_null(reg->type)) { + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. 
+@@ -4796,9 +4786,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, + + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) + { +- return type == ARG_PTR_TO_MEM || +- type == ARG_PTR_TO_MEM_OR_NULL || +- type == ARG_PTR_TO_UNINIT_MEM; ++ return base_type(type) == ARG_PTR_TO_MEM || ++ base_type(type) == ARG_PTR_TO_UNINIT_MEM; + } + + static bool arg_type_is_mem_size(enum bpf_arg_type type) +@@ -4900,8 +4889,7 @@ static const struct bpf_reg_types mem_types = { + PTR_TO_MAP_KEY, + PTR_TO_MAP_VALUE, + PTR_TO_MEM, +- PTR_TO_RDONLY_BUF, +- PTR_TO_RDWR_BUF, ++ PTR_TO_BUF, + }, + }; + +@@ -4932,31 +4920,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { + [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, +- [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, + [ARG_CONST_SIZE] = &scalar_types, + [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_MAP_PTR] = &const_map_ptr_types, + [ARG_PTR_TO_CTX] = &context_types, +- [ARG_PTR_TO_CTX_OR_NULL] = &context_types, + [ARG_PTR_TO_SOCK_COMMON] = &sock_types, + #ifdef CONFIG_NET + [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, + #endif + [ARG_PTR_TO_SOCKET] = &fullsock_types, +- [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, + [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, + [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, + [ARG_PTR_TO_MEM] = &mem_types, +- [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, + [ARG_PTR_TO_UNINIT_MEM] = &mem_types, + [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, +- [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, + [ARG_PTR_TO_INT] = &int_ptr_types, + [ARG_PTR_TO_LONG] = &int_ptr_types, + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, + [ARG_PTR_TO_FUNC] = &func_ptr_types, +- [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, ++ [ARG_PTR_TO_STACK] = &stack_ptr_types, + [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, + [ARG_PTR_TO_TIMER] = &timer_types, + }; +@@ -4970,12 +4953,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, + const struct bpf_reg_types *compatible; + int i, j; + +- compatible = compatible_reg_types[arg_type]; ++ compatible = compatible_reg_types[base_type(arg_type)]; + if (!compatible) { + verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); + return -EFAULT; + } + ++ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, ++ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY ++ * ++ * Same for MAYBE_NULL: ++ * ++ * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, ++ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL ++ * ++ * Therefore we fold these flags depending on the arg_type before comparison. 
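The folding described in the comment above is deliberately one-way: a flag on the expected argument type widens what a register may carry, but a flag present only on the register must not slip past an argument type that does not mention it. A small compilable model of that rule; the real check_reg_type() compares the folded register type against a per-argument table of acceptable base types rather than a single value, and the numeric values here are illustrative.

#include <stdio.h>
#include <stdbool.h>

#define PTR_MAYBE_NULL	0x100u
#define MEM_RDONLY	0x200u
#define TYPE_FLAGS	(PTR_MAYBE_NULL | MEM_RDONLY)

#define PTR_TO_MEM	5u	/* illustrative base-type value only */

/* Fold the register flags that the expected arg type also carries,
 * then require a match with the arg's base type. */
static bool arg_accepts_reg(unsigned int arg_type, unsigned int reg_type)
{
	if (arg_type & MEM_RDONLY)
		reg_type &= ~MEM_RDONLY;
	if (arg_type & PTR_MAYBE_NULL)
		reg_type &= ~PTR_MAYBE_NULL;
	return reg_type == (arg_type & ~TYPE_FLAGS);
}

int main(void)
{
	printf("%d\n", arg_accepts_reg(PTR_TO_MEM | MEM_RDONLY, PTR_TO_MEM));			/* 1 */
	printf("%d\n", arg_accepts_reg(PTR_TO_MEM | MEM_RDONLY, PTR_TO_MEM | MEM_RDONLY));	/* 1 */
	printf("%d\n", arg_accepts_reg(PTR_TO_MEM,              PTR_TO_MEM | MEM_RDONLY));	/* 0 */
	return 0;
}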
++ */ ++ if (arg_type & MEM_RDONLY) ++ type &= ~MEM_RDONLY; ++ if (arg_type & PTR_MAYBE_NULL) ++ type &= ~PTR_MAYBE_NULL; ++ + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { + expected = compatible->types[i]; + if (expected == NOT_INIT) +@@ -4985,14 +4983,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, + goto found; + } + +- verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); ++ verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); + for (j = 0; j + 1 < i; j++) +- verbose(env, "%s, ", reg_type_str[compatible->types[j]]); +- verbose(env, "%s\n", reg_type_str[compatible->types[j]]); ++ verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); ++ verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); + return -EACCES; + + found: +- if (type == PTR_TO_BTF_ID) { ++ if (reg->type == PTR_TO_BTF_ID) { + if (!arg_btf_id) { + if (!compatible->btf_id) { + verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); +@@ -5051,15 +5049,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, + return -EACCES; + } + +- if (arg_type == ARG_PTR_TO_MAP_VALUE || +- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || +- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { ++ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || ++ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + err = resolve_map_arg_type(env, meta, &arg_type); + if (err) + return err; + } + +- if (register_is_null(reg) && arg_type_may_be_null(arg_type)) ++ if (register_is_null(reg) && type_may_be_null(arg_type)) + /* A NULL register has a SCALAR_VALUE type, so skip + * type checking. + */ +@@ -5128,10 +5125,11 @@ skip_type_check: + err = check_helper_mem_access(env, regno, + meta->map_ptr->key_size, false, + NULL); +- } else if (arg_type == ARG_PTR_TO_MAP_VALUE || +- (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && +- !register_is_null(reg)) || +- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { ++ } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || ++ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { ++ if (type_may_be_null(arg_type) && register_is_null(reg)) ++ return 0; ++ + /* bpf_map_xxx(..., map_ptr, ..., value) call: + * check [value, value + map->value_size) validity + */ +@@ -6206,6 +6204,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn + int *insn_idx_p) + { + const struct bpf_func_proto *fn = NULL; ++ enum bpf_return_type ret_type; ++ enum bpf_type_flag ret_flag; + struct bpf_reg_state *regs; + struct bpf_call_arg_meta meta; + int insn_idx = *insn_idx_p; +@@ -6339,13 +6339,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn + regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + + /* update return register (already marked as written above) */ +- if (fn->ret_type == RET_INTEGER) { ++ ret_type = fn->ret_type; ++ ret_flag = type_flag(fn->ret_type); ++ if (ret_type == RET_INTEGER) { + /* sets type to SCALAR_VALUE */ + mark_reg_unknown(env, regs, BPF_REG_0); +- } else if (fn->ret_type == RET_VOID) { ++ } else if (ret_type == RET_VOID) { + regs[BPF_REG_0].type = NOT_INIT; +- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || +- fn->ret_type == RET_PTR_TO_MAP_VALUE) { ++ } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { + /* There is no offset yet applied, variable or fixed */ + mark_reg_known_zero(env, regs, BPF_REG_0); + /* remember map_ptr, so that check_map_access() +@@ -6359,28 +6360,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct 
bpf_insn *insn + } + regs[BPF_REG_0].map_ptr = meta.map_ptr; + regs[BPF_REG_0].map_uid = meta.map_uid; +- if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { +- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; +- if (map_value_has_spin_lock(meta.map_ptr)) +- regs[BPF_REG_0].id = ++env->id_gen; +- } else { +- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; ++ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; ++ if (!type_may_be_null(ret_type) && ++ map_value_has_spin_lock(meta.map_ptr)) { ++ regs[BPF_REG_0].id = ++env->id_gen; + } +- } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { ++ } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { + mark_reg_known_zero(env, regs, BPF_REG_0); +- regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; +- } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { ++ regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; ++ } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { + mark_reg_known_zero(env, regs, BPF_REG_0); +- regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; +- } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { ++ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; ++ } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { + mark_reg_known_zero(env, regs, BPF_REG_0); +- regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; +- } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { ++ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; ++ } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { + mark_reg_known_zero(env, regs, BPF_REG_0); +- regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; ++ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; + regs[BPF_REG_0].mem_size = meta.mem_size; +- } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || +- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { ++ } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { + const struct btf_type *t; + + mark_reg_known_zero(env, regs, BPF_REG_0); +@@ -6398,29 +6396,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn + tname, PTR_ERR(ret)); + return -EINVAL; + } +- regs[BPF_REG_0].type = +- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? +- PTR_TO_MEM : PTR_TO_MEM_OR_NULL; ++ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; + regs[BPF_REG_0].mem_size = tsize; + } else { +- regs[BPF_REG_0].type = +- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? +- PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; ++ /* MEM_RDONLY may be carried from ret_flag, but it ++ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise ++ * it will confuse the check of PTR_TO_BTF_ID in ++ * check_mem_access(). ++ */ ++ ret_flag &= ~MEM_RDONLY; ++ ++ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; + regs[BPF_REG_0].btf = meta.ret_btf; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; + } +- } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || +- fn->ret_type == RET_PTR_TO_BTF_ID) { ++ } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { + int ret_btf_id; + + mark_reg_known_zero(env, regs, BPF_REG_0); +- regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? 
+- PTR_TO_BTF_ID : +- PTR_TO_BTF_ID_OR_NULL; ++ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; + ret_btf_id = *fn->ret_btf_id; + if (ret_btf_id == 0) { +- verbose(env, "invalid return type %d of func %s#%d\n", +- fn->ret_type, func_id_name(func_id), func_id); ++ verbose(env, "invalid return type %u of func %s#%d\n", ++ base_type(ret_type), func_id_name(func_id), ++ func_id); + return -EINVAL; + } + /* current BPF helper definitions are only coming from +@@ -6429,12 +6428,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn + regs[BPF_REG_0].btf = btf_vmlinux; + regs[BPF_REG_0].btf_id = ret_btf_id; + } else { +- verbose(env, "unknown return type %d of func %s#%d\n", +- fn->ret_type, func_id_name(func_id), func_id); ++ verbose(env, "unknown return type %u of func %s#%d\n", ++ base_type(ret_type), func_id_name(func_id), func_id); + return -EINVAL; + } + +- if (reg_type_may_be_null(regs[BPF_REG_0].type)) ++ if (type_may_be_null(regs[BPF_REG_0].type)) + regs[BPF_REG_0].id = ++env->id_gen; + + if (is_ptr_cast_function(func_id)) { +@@ -6633,25 +6632,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, + + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { + verbose(env, "math between %s pointer and %lld is not allowed\n", +- reg_type_str[type], val); ++ reg_type_str(env, type), val); + return false; + } + + if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { + verbose(env, "%s pointer offset %d is not allowed\n", +- reg_type_str[type], reg->off); ++ reg_type_str(env, type), reg->off); + return false; + } + + if (smin == S64_MIN) { + verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", +- reg_type_str[type]); ++ reg_type_str(env, type)); + return false; + } + + if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { + verbose(env, "value %lld makes %s pointer be out of bounds\n", +- smin, reg_type_str[type]); ++ smin, reg_type_str(env, type)); + return false; + } + +@@ -7028,11 +7027,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, + return -EACCES; + } + +- switch (ptr_reg->type) { +- case PTR_TO_MAP_VALUE_OR_NULL: ++ if (ptr_reg->type & PTR_MAYBE_NULL) { + verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", +- dst, reg_type_str[ptr_reg->type]); ++ dst, reg_type_str(env, ptr_reg->type)); + return -EACCES; ++ } ++ ++ switch (base_type(ptr_reg->type)) { + case CONST_PTR_TO_MAP: + /* smin_val represents the known value */ + if (known && smin_val == 0 && opcode == BPF_ADD) +@@ -7045,10 +7046,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, + case PTR_TO_XDP_SOCK: + reject: + verbose(env, "R%d pointer arithmetic on %s prohibited\n", +- dst, reg_type_str[ptr_reg->type]); ++ dst, reg_type_str(env, ptr_reg->type)); + return -EACCES; + default: +- if (reg_type_may_be_null(ptr_reg->type)) ++ if (type_may_be_null(ptr_reg->type)) + goto reject; + break; + } +@@ -8770,7 +8771,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, + struct bpf_reg_state *reg, u32 id, + bool is_null) + { +- if (reg_type_may_be_null(reg->type) && reg->id == id && ++ if (type_may_be_null(reg->type) && reg->id == id && + !WARN_ON_ONCE(!reg->id)) { + if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || + !tnum_equals_const(reg->var_off, 0) || +@@ -9148,7 +9149,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, + */ + if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && + insn->imm == 0 && (opcode == BPF_JEQ || 
opcode == BPF_JNE) && +- reg_type_may_be_null(dst_reg->type)) { ++ type_may_be_null(dst_reg->type)) { + /* Mark all identical registers in each branch as either + * safe or unknown depending R == 0 or R != 0 conditional. + */ +@@ -9207,7 +9208,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) + + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { + dst_reg->type = aux->btf_var.reg_type; +- switch (dst_reg->type) { ++ switch (base_type(dst_reg->type)) { + case PTR_TO_MEM: + dst_reg->mem_size = aux->btf_var.mem_size; + break; +@@ -9404,7 +9405,7 @@ static int check_return_code(struct bpf_verifier_env *env) + /* enforce return zero from async callbacks like timer */ + if (reg->type != SCALAR_VALUE) { + verbose(env, "In async callback the register R0 is not a known value (%s)\n", +- reg_type_str[reg->type]); ++ reg_type_str(env, reg->type)); + return -EINVAL; + } + +@@ -9418,7 +9419,7 @@ static int check_return_code(struct bpf_verifier_env *env) + if (is_subprog) { + if (reg->type != SCALAR_VALUE) { + verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", +- reg_type_str[reg->type]); ++ reg_type_str(env, reg->type)); + return -EINVAL; + } + return 0; +@@ -9482,7 +9483,7 @@ static int check_return_code(struct bpf_verifier_env *env) + + if (reg->type != SCALAR_VALUE) { + verbose(env, "At program exit the register R0 is not a known value (%s)\n", +- reg_type_str[reg->type]); ++ reg_type_str(env, reg->type)); + return -EINVAL; + } + +@@ -10263,7 +10264,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + return true; + if (rcur->type == NOT_INIT) + return false; +- switch (rold->type) { ++ switch (base_type(rold->type)) { + case SCALAR_VALUE: + if (env->explore_alu_limits) + return false; +@@ -10285,6 +10286,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + } + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: ++ /* a PTR_TO_MAP_VALUE could be safe to use as a ++ * PTR_TO_MAP_VALUE_OR_NULL into the same map. ++ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- ++ * checked, doing so could have affected others with the same ++ * id, and we can't check for that because we lost the id when ++ * we converted to a PTR_TO_MAP_VALUE. ++ */ ++ if (type_may_be_null(rold->type)) { ++ if (!type_may_be_null(rcur->type)) ++ return false; ++ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) ++ return false; ++ /* Check our ids match any regs they're supposed to */ ++ return check_ids(rold->id, rcur->id, idmap); ++ } ++ + /* If the new min/max/var_off satisfy the old ones and + * everything else matches, we are OK. + * 'id' is not compared, since it's only used for maps with +@@ -10296,20 +10313,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); +- case PTR_TO_MAP_VALUE_OR_NULL: +- /* a PTR_TO_MAP_VALUE could be safe to use as a +- * PTR_TO_MAP_VALUE_OR_NULL into the same map. +- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- +- * checked, doing so could have affected others with the same +- * id, and we can't check for that because we lost the id when +- * we converted to a PTR_TO_MAP_VALUE. 
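The null tracking in check_cond_jmp_op() and regsafe() leans on reg->id: every copy of the same maybe-null pointer shares one id, so a single comparison against NULL can retire PTR_MAYBE_NULL (or zero the register) for all of them at once. A compilable toy version of that propagation; the register layout and flag value are simplified stand-ins for the verifier's bpf_reg_state.

#include <stdio.h>

#define PTR_MAYBE_NULL		0x100u
#define PTR_TO_MAP_VALUE	3u	/* illustrative base-type value only */

struct toy_reg {
	unsigned int type;
	unsigned int id;
};

/* In the non-NULL branch, clear the flag on every register carrying the
 * same id; in the NULL branch, turn them into known-zero values. */
static void mark_ptr_or_null_regs(struct toy_reg *regs, int n,
				  unsigned int id, int is_null)
{
	for (int i = 0; i < n; i++) {
		if (!(regs[i].type & PTR_MAYBE_NULL) || regs[i].id != id)
			continue;
		if (is_null)
			regs[i].type = 0;	/* toy stand-in for a known-zero scalar */
		else
			regs[i].type &= ~PTR_MAYBE_NULL;
	}
}

int main(void)
{
	struct toy_reg regs[2] = {
		{ PTR_TO_MAP_VALUE | PTR_MAYBE_NULL, 7 },	/* r0 = map_lookup(...) */
		{ PTR_TO_MAP_VALUE | PTR_MAYBE_NULL, 7 },	/* r1 = r0 (same id)     */
	};

	mark_ptr_or_null_regs(regs, 2, 7, 0);	/* "if (r0 != NULL)" branch taken */
	printf("r0=0x%x r1=0x%x\n", regs[0].type, regs[1].type);	/* both 0x3 */
	return 0;
}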
+- */ +- if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) +- return false; +- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) +- return false; +- /* Check our ids match any regs they're supposed to */ +- return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET_META: + case PTR_TO_PACKET: + if (rcur->type != rold->type) +@@ -10338,11 +10341,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: + case PTR_TO_SOCKET: +- case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: +- case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: +- case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: + /* Only valid matches are exact, which memcmp() above + * would have accepted +@@ -10868,17 +10868,13 @@ next: + /* Return true if it's OK to have the same insn return a different type. */ + static bool reg_type_mismatch_ok(enum bpf_reg_type type) + { +- switch (type) { ++ switch (base_type(type)) { + case PTR_TO_CTX: + case PTR_TO_SOCKET: +- case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: +- case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: +- case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: + case PTR_TO_BTF_ID: +- case PTR_TO_BTF_ID_OR_NULL: + return false; + default: + return true; +@@ -11102,7 +11098,7 @@ static int do_check(struct bpf_verifier_env *env) + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_ST stores into R%d %s is not allowed\n", + insn->dst_reg, +- reg_type_str[reg_state(env, insn->dst_reg)->type]); ++ reg_type_str(env, reg_state(env, insn->dst_reg)->type)); + return -EACCES; + } + +@@ -11353,7 +11349,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, + err = -EINVAL; + goto err_put; + } +- aux->btf_var.reg_type = PTR_TO_MEM; ++ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; + aux->btf_var.mem_size = tsize; + } else { + aux->btf_var.reg_type = PTR_TO_BTF_ID; +@@ -13175,7 +13171,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) + mark_reg_known_zero(env, regs, i); + else if (regs[i].type == SCALAR_VALUE) + mark_reg_unknown(env, regs, i); +- else if (regs[i].type == PTR_TO_MEM_OR_NULL) { ++ else if (base_type(regs[i].type) == PTR_TO_MEM) { + const u32 mem_size = regs[i].mem_size; + + mark_reg_known_zero(env, regs, i); +diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c +index 5a18b861fcf75..c289010b0964e 100644 +--- a/kernel/trace/bpf_trace.c ++++ b/kernel/trace/bpf_trace.c +@@ -345,7 +345,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + }; + +@@ -394,7 +394,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { + .func = bpf_trace_printk, + .gpl_only = true, + .ret_type = RET_INTEGER, +- .arg1_type = ARG_PTR_TO_MEM, ++ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg2_type = ARG_CONST_SIZE, + }; + +@@ -446,9 +446,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = { + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, +- .arg4_type = ARG_PTR_TO_MEM_OR_NULL, ++ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + }; + +@@ -463,7 +463,7 @@ static const struct bpf_func_proto 
bpf_seq_write_proto = { + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + }; + +@@ -487,7 +487,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = { + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + }; +@@ -648,7 +648,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + }; + +@@ -958,7 +958,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, +- .arg3_type = ARG_PTR_TO_MEM, ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, + }; +@@ -1207,7 +1207,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + }; + +@@ -1429,7 +1429,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + }; + +@@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + }; +diff --git a/lib/iov_iter.c b/lib/iov_iter.c +index c5b2f0f4b8a84..6d146f77601d7 100644 +--- a/lib/iov_iter.c ++++ b/lib/iov_iter.c +@@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + +- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) { ++ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) { + kaddr = kmap_atomic(page); + from = kaddr + offset; + +@@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + +- if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) { ++ if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) { + kaddr = kmap_atomic(page); + to = kaddr + offset; + +@@ -431,35 +431,81 @@ out: + } + + /* ++ * fault_in_iov_iter_readable - fault in iov iterator for reading ++ * @i: iterator ++ * @size: maximum length ++ * + * Fault in one or more iovecs of the given iov_iter, to a maximum length of +- * bytes. For each iovec, fault in each page that constitutes the iovec. ++ * @size. For each iovec, fault in each page that constitutes the iovec. ++ * ++ * Returns the number of bytes not faulted in (like copy_to_user() and ++ * copy_from_user()). 
+ * +- * Return 0 on success, or non-zero if the memory could not be accessed (i.e. +- * because it is an invalid address). ++ * Always returns 0 for non-userspace iterators. + */ +-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes) ++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) + { + if (iter_is_iovec(i)) { ++ size_t count = min(size, iov_iter_count(i)); + const struct iovec *p; + size_t skip; + +- if (bytes > i->count) +- bytes = i->count; +- for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) { +- size_t len = min(bytes, p->iov_len - skip); +- int err; ++ size -= count; ++ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { ++ size_t len = min(count, p->iov_len - skip); ++ size_t ret; + + if (unlikely(!len)) + continue; +- err = fault_in_pages_readable(p->iov_base + skip, len); +- if (unlikely(err)) +- return err; +- bytes -= len; ++ ret = fault_in_readable(p->iov_base + skip, len); ++ count -= len - ret; ++ if (ret) ++ break; + } ++ return count + size; + } + return 0; + } +-EXPORT_SYMBOL(iov_iter_fault_in_readable); ++EXPORT_SYMBOL(fault_in_iov_iter_readable); ++ ++/* ++ * fault_in_iov_iter_writeable - fault in iov iterator for writing ++ * @i: iterator ++ * @size: maximum length ++ * ++ * Faults in the iterator using get_user_pages(), i.e., without triggering ++ * hardware page faults. This is primarily useful when we already know that ++ * some or all of the pages in @i aren't in memory. ++ * ++ * Returns the number of bytes not faulted in, like copy_to_user() and ++ * copy_from_user(). ++ * ++ * Always returns 0 for non-user-space iterators. ++ */ ++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) ++{ ++ if (iter_is_iovec(i)) { ++ size_t count = min(size, iov_iter_count(i)); ++ const struct iovec *p; ++ size_t skip; ++ ++ size -= count; ++ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { ++ size_t len = min(count, p->iov_len - skip); ++ size_t ret; ++ ++ if (unlikely(!len)) ++ continue; ++ ret = fault_in_safe_writeable(p->iov_base + skip, len); ++ count -= len - ret; ++ if (ret) ++ break; ++ } ++ return count + size; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(fault_in_iov_iter_writeable); + + void iov_iter_init(struct iov_iter *i, unsigned int direction, + const struct iovec *iov, unsigned long nr_segs, +@@ -468,6 +514,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, + WARN_ON(direction & ~(READ | WRITE)); + *i = (struct iov_iter) { + .iter_type = ITER_IOVEC, ++ .nofault = false, + .data_source = direction, + .iov = iov, + .nr_segs = nr_segs, +@@ -1483,13 +1530,17 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, + return 0; + + if (likely(iter_is_iovec(i))) { ++ unsigned int gup_flags = 0; + unsigned long addr; + ++ if (iov_iter_rw(i) != WRITE) ++ gup_flags |= FOLL_WRITE; ++ if (i->nofault) ++ gup_flags |= FOLL_NOFAULT; ++ + addr = first_iovec_segment(i, &len, start, maxsize, maxpages); + n = DIV_ROUND_UP(len, PAGE_SIZE); +- res = get_user_pages_fast(addr, n, +- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, +- pages); ++ res = get_user_pages_fast(addr, n, gup_flags, pages); + if (unlikely(res <= 0)) + return res; + return (res == n ? 
len : res * PAGE_SIZE) - *start; +@@ -1605,15 +1656,20 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, + return 0; + + if (likely(iter_is_iovec(i))) { ++ unsigned int gup_flags = 0; + unsigned long addr; + ++ if (iov_iter_rw(i) != WRITE) ++ gup_flags |= FOLL_WRITE; ++ if (i->nofault) ++ gup_flags |= FOLL_NOFAULT; ++ + addr = first_iovec_segment(i, &len, start, maxsize, ~0U); + n = DIV_ROUND_UP(len, PAGE_SIZE); + p = get_pages_array(n); + if (!p) + return -ENOMEM; +- res = get_user_pages_fast(addr, n, +- iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p); ++ res = get_user_pages_fast(addr, n, gup_flags, p); + if (unlikely(res <= 0)) { + kvfree(p); + *pages = NULL; +diff --git a/mm/filemap.c b/mm/filemap.c +index 1293c3409e429..00e391e758801 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -90,7 +90,7 @@ + * ->lock_page (filemap_fault, access_process_vm) + * + * ->i_rwsem (generic_perform_write) +- * ->mmap_lock (fault_in_pages_readable->do_page_fault) ++ * ->mmap_lock (fault_in_readable->do_page_fault) + * + * bdi->wb.list_lock + * sb_lock (fs/fs-writeback.c) +@@ -3760,7 +3760,7 @@ again: + * same page as we're writing to, without it being marked + * up-to-date. + */ +- if (unlikely(iov_iter_fault_in_readable(i, bytes))) { ++ if (unlikely(fault_in_iov_iter_readable(i, bytes))) { + status = -EFAULT; + break; + } +diff --git a/mm/gup.c b/mm/gup.c +index 52f08e3177e9f..ba2ab7a223f8e 100644 +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -943,6 +943,8 @@ static int faultin_page(struct vm_area_struct *vma, + /* mlock all present pages, but do not fault in new pages */ + if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) + return -ENOENT; ++ if (*flags & FOLL_NOFAULT) ++ return -EFAULT; + if (*flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (*flags & FOLL_REMOTE) +@@ -1681,6 +1683,122 @@ finish_or_fault: + } + #endif /* !CONFIG_MMU */ + ++/** ++ * fault_in_writeable - fault in userspace address range for writing ++ * @uaddr: start of address range ++ * @size: size of address range ++ * ++ * Returns the number of bytes not faulted in (like copy_to_user() and ++ * copy_from_user()). ++ */ ++size_t fault_in_writeable(char __user *uaddr, size_t size) ++{ ++ char __user *start = uaddr, *end; ++ ++ if (unlikely(size == 0)) ++ return 0; ++ if (!PAGE_ALIGNED(uaddr)) { ++ if (unlikely(__put_user(0, uaddr) != 0)) ++ return size; ++ uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); ++ } ++ end = (char __user *)PAGE_ALIGN((unsigned long)start + size); ++ if (unlikely(end < start)) ++ end = NULL; ++ while (uaddr != end) { ++ if (unlikely(__put_user(0, uaddr) != 0)) ++ goto out; ++ uaddr += PAGE_SIZE; ++ } ++ ++out: ++ if (size > uaddr - start) ++ return size - (uaddr - start); ++ return 0; ++} ++EXPORT_SYMBOL(fault_in_writeable); ++ ++/* ++ * fault_in_safe_writeable - fault in an address range for writing ++ * @uaddr: start of address range ++ * @size: length of address range ++ * ++ * Faults in an address range for writing. This is primarily useful when we ++ * already know that some or all of the pages in the address range aren't in ++ * memory. ++ * ++ * Unlike fault_in_writeable(), this function is non-destructive. ++ * ++ * Note that we don't pin or otherwise hold the pages referenced that we fault ++ * in. There's no guarantee that they'll stay in memory for any duration of ++ * time. ++ * ++ * Returns the number of bytes not faulted in, like copy_to_user() and ++ * copy_from_user(). 
++ */ ++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) ++{ ++ unsigned long start = (unsigned long)uaddr, end; ++ struct mm_struct *mm = current->mm; ++ bool unlocked = false; ++ ++ if (unlikely(size == 0)) ++ return 0; ++ end = PAGE_ALIGN(start + size); ++ if (end < start) ++ end = 0; ++ ++ mmap_read_lock(mm); ++ do { ++ if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) ++ break; ++ start = (start + PAGE_SIZE) & PAGE_MASK; ++ } while (start != end); ++ mmap_read_unlock(mm); ++ ++ if (size > (unsigned long)uaddr - start) ++ return size - ((unsigned long)uaddr - start); ++ return 0; ++} ++EXPORT_SYMBOL(fault_in_safe_writeable); ++ ++/** ++ * fault_in_readable - fault in userspace address range for reading ++ * @uaddr: start of user address range ++ * @size: size of user address range ++ * ++ * Returns the number of bytes not faulted in (like copy_to_user() and ++ * copy_from_user()). ++ */ ++size_t fault_in_readable(const char __user *uaddr, size_t size) ++{ ++ const char __user *start = uaddr, *end; ++ volatile char c; ++ ++ if (unlikely(size == 0)) ++ return 0; ++ if (!PAGE_ALIGNED(uaddr)) { ++ if (unlikely(__get_user(c, uaddr) != 0)) ++ return size; ++ uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); ++ } ++ end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); ++ if (unlikely(end < start)) ++ end = NULL; ++ while (uaddr != end) { ++ if (unlikely(__get_user(c, uaddr) != 0)) ++ goto out; ++ uaddr += PAGE_SIZE; ++ } ++ ++out: ++ (void)c; ++ if (size > uaddr - start) ++ return size - (uaddr - start); ++ return 0; ++} ++EXPORT_SYMBOL(fault_in_readable); ++ + /** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address +@@ -2733,7 +2851,7 @@ static int internal_get_user_pages_fast(unsigned long start, + + if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | + FOLL_FORCE | FOLL_PIN | FOLL_GET | +- FOLL_FAST_ONLY))) ++ FOLL_FAST_ONLY | FOLL_NOFAULT))) + return -EINVAL; + + if (gup_flags & FOLL_PIN) +diff --git a/mm/kfence/core.c b/mm/kfence/core.c +index 86260e8f28302..66076d8742b78 100644 +--- a/mm/kfence/core.c ++++ b/mm/kfence/core.c +@@ -528,6 +528,8 @@ static bool __init kfence_init_pool(void) + * enters __slab_free() slow-path. + */ + for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { ++ struct page *page = &pages[i]; ++ + if (!i || (i % 2)) + continue; + +@@ -535,7 +537,11 @@ static bool __init kfence_init_pool(void) + if (WARN_ON(compound_head(&pages[i]) != &pages[i])) + goto err; + +- __SetPageSlab(&pages[i]); ++ __SetPageSlab(page); ++#ifdef CONFIG_MEMCG ++ page->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | ++ MEMCG_DATA_OBJCGS; ++#endif + } + + /* +@@ -911,6 +917,9 @@ void __kfence_free(void *addr) + { + struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + ++#ifdef CONFIG_MEMCG ++ KFENCE_WARN_ON(meta->objcg); ++#endif + /* + * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing + * the object, as the object page may be recycled for other-typed +diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h +index 92bf6eff6060d..600f2e2431d6d 100644 +--- a/mm/kfence/kfence.h ++++ b/mm/kfence/kfence.h +@@ -89,6 +89,9 @@ struct kfence_metadata { + struct kfence_track free_track; + /* For updating alloc_covered on frees. 
*/ + u32 alloc_stack_hash; ++#ifdef CONFIG_MEMCG ++ struct obj_cgroup *objcg; ++#endif + }; + + extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; +diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c +index 68d2cbf8331ac..ea61dfe19c869 100644 +--- a/net/core/bpf_sk_storage.c ++++ b/net/core/bpf_sk_storage.c +@@ -929,7 +929,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { + { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__bpf_sk_storage_map, value), +- PTR_TO_RDWR_BUF_OR_NULL }, ++ PTR_TO_BUF | PTR_MAYBE_NULL }, + }, + .seq_info = &iter_seq_info, + }; +diff --git a/net/core/filter.c b/net/core/filter.c +index cdd7e92db3030..821278b906b71 100644 +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -1713,7 +1713,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +- .arg3_type = ARG_PTR_TO_MEM, ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, + }; +@@ -2018,9 +2018,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, +- .arg1_type = ARG_PTR_TO_MEM_OR_NULL, ++ .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, +- .arg3_type = ARG_PTR_TO_MEM_OR_NULL, ++ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, + }; +@@ -2541,7 +2541,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +- .arg2_type = ARG_PTR_TO_MEM_OR_NULL, ++ .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + }; +@@ -4177,7 +4177,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + }; + +@@ -4191,7 +4191,7 @@ const struct bpf_func_proto bpf_skb_output_proto = { + .arg1_btf_id = &bpf_skb_output_btf_ids[0], + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + }; + +@@ -4374,7 +4374,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + }; +@@ -4400,7 +4400,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + }; + +@@ -4570,7 +4570,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + }; + +@@ -4584,7 +4584,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = { + .arg1_btf_id = 
&bpf_xdp_output_btf_ids[0], + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + }; + +@@ -5072,7 +5072,7 @@ const struct bpf_func_proto bpf_sk_setsockopt_proto = { + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, + }; + +@@ -5106,7 +5106,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, + }; + +@@ -5140,7 +5140,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, + }; + +@@ -5315,7 +5315,7 @@ static const struct bpf_func_proto bpf_bind_proto = { + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + }; + +@@ -5903,7 +5903,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +- .arg3_type = ARG_PTR_TO_MEM, ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE + }; + +@@ -5913,7 +5913,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +- .arg3_type = ARG_PTR_TO_MEM, ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE + }; + +@@ -5956,7 +5956,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +- .arg3_type = ARG_PTR_TO_MEM, ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE + }; + +@@ -6044,7 +6044,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +- .arg3_type = ARG_PTR_TO_MEM, ++ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE + }; + +@@ -6269,7 +6269,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +@@ -6288,7 +6288,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +@@ -6307,7 +6307,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = 
ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +@@ -6344,7 +6344,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +@@ -6367,7 +6367,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +@@ -6390,7 +6390,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +@@ -6409,7 +6409,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +@@ -6428,7 +6428,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +@@ -6447,7 +6447,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +@@ -6769,9 +6769,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, + }; + +@@ -6838,9 +6838,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, +- .arg4_type = ARG_PTR_TO_MEM, ++ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, + }; + +@@ -7069,7 +7069,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +- .arg2_type = ARG_PTR_TO_MEM, ++ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + }; +diff --git a/net/core/sock_map.c b/net/core/sock_map.c +index 8288b5382f08d..6351b6af7aca9 100644 +--- a/net/core/sock_map.c ++++ b/net/core/sock_map.c +@@ -1575,7 +1575,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { + .ctx_arg_info_size = 2, 
+ .ctx_arg_info = { + { offsetof(struct bpf_iter__sockmap, key), +- PTR_TO_RDONLY_BUF_OR_NULL }, ++ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, + { offsetof(struct bpf_iter__sockmap, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, +diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +index cf3acfa5a91d5..69455fe90ac3e 100644 +--- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c ++++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +@@ -7,6 +7,7 @@ + #include "test_ksyms_btf.skel.h" + #include "test_ksyms_btf_null_check.skel.h" + #include "test_ksyms_weak.skel.h" ++#include "test_ksyms_btf_write_check.skel.h" + + static int duration; + +@@ -109,6 +110,16 @@ cleanup: + test_ksyms_weak__destroy(skel); + } + ++static void test_write_check(void) ++{ ++ struct test_ksyms_btf_write_check *skel; ++ ++ skel = test_ksyms_btf_write_check__open_and_load(); ++ ASSERT_ERR_PTR(skel, "unexpected load of a prog writing to ksym memory\n"); ++ ++ test_ksyms_btf_write_check__destroy(skel); ++} ++ + void test_ksyms_btf(void) + { + int percpu_datasec; +@@ -136,4 +147,7 @@ void test_ksyms_btf(void) + + if (test__start_subtest("weak_ksyms")) + test_weak_syms(); ++ ++ if (test__start_subtest("write_check")) ++ test_write_check(); + } +diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c +new file mode 100644 +index 0000000000000..2180c41cd890f +--- /dev/null ++++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c +@@ -0,0 +1,29 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* Copyright (c) 2021 Google */ ++ ++#include "vmlinux.h" ++ ++#include <bpf/bpf_helpers.h> ++ ++extern const int bpf_prog_active __ksym; /* int type global var. */ ++ ++SEC("raw_tp/sys_enter") ++int handler(const void *ctx) ++{ ++ int *active; ++ __u32 cpu; ++ ++ cpu = bpf_get_smp_processor_id(); ++ active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); ++ if (active) { ++ /* Kernel memory obtained from bpf_{per,this}_cpu_ptr ++ * is read-only, should _not_ pass verification. ++ */ ++ /* WRITE_ONCE */ ++ *(volatile int *)active = -1; ++ } ++ ++ return 0; ++} ++ ++char _license[] SEC("license") = "GPL"; +diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c +index 336a749673d19..2e701e7f69680 100644 +--- a/tools/testing/selftests/bpf/verifier/calls.c ++++ b/tools/testing/selftests/bpf/verifier/calls.c +@@ -107,6 +107,25 @@ + .result = REJECT, + .errstr = "R0 min value is outside of the allowed memory range", + }, ++{ ++ "calls: trigger reg2btf_ids[reg->type] for reg->type > __BPF_REG_TYPE_MAX", ++ .insns = { ++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), ++ BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), ++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .prog_type = BPF_PROG_TYPE_SCHED_CLS, ++ .result = REJECT, ++ .errstr = "arg#0 pointer type STRUCT prog_test_ref_kfunc must point", ++ .fixup_kfunc_btf_id = { ++ { "bpf_kfunc_call_test_acquire", 3 }, ++ { "bpf_kfunc_call_test_release", 5 }, ++ }, ++}, + { + "calls: overlapping caller/callee", + .insns = { |