author | Mike Pagano <mpagano@gentoo.org> | 2017-09-20 06:10:55 -0400 |
---|---|---|
committer | Mike Pagano <mpagano@gentoo.org> | 2017-09-20 06:10:55 -0400 |
commit | 55c1db3954d3cdf7dcc7c95dc15d7827827f9294 (patch) | |
tree | 285a9ca070a3c7cf3dcfe8b908940f43fea79c76 | |
parent | Remove redundant patch (diff) | |
download | linux-patches-4.9-53.tar.gz linux-patches-4.9-53.tar.bz2 linux-patches-4.9-53.zip | |
Linux patch 4.9.51 (4.9-53)
-rw-r--r-- | 0000_README | 4 |
-rw-r--r-- | 1050_linux-4.9.51.patch | 3912 |
2 files changed, 3916 insertions, 0 deletions
diff --git a/0000_README b/0000_README index d21869ee..54efac88 100644 --- a/0000_README +++ b/0000_README @@ -243,6 +243,10 @@ Patch: 1049_linux-4.9.50.patch From: http://www.kernel.org Desc: Linux 4.9.50 +Patch: 1050_linux-4.9.51.patch +From: http://www.kernel.org +Desc: Linux 4.9.51 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1050_linux-4.9.51.patch b/1050_linux-4.9.51.patch new file mode 100644 index 00000000..5dcc1f29 --- /dev/null +++ b/1050_linux-4.9.51.patch @@ -0,0 +1,3912 @@ +diff --git a/Makefile b/Makefile +index 038d126a15fc..b48aebbe187f 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 4 + PATCHLEVEL = 9 +-SUBLEVEL = 50 ++SUBLEVEL = 51 + EXTRAVERSION = + NAME = Roaring Lionus + +diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h +index b31761ecce63..7bcd138c3aa9 100644 +--- a/arch/x86/include/asm/elf.h ++++ b/arch/x86/include/asm/elf.h +@@ -204,6 +204,7 @@ void set_personality_ia32(bool); + + #define ELF_CORE_COPY_REGS(pr_reg, regs) \ + do { \ ++ unsigned long base; \ + unsigned v; \ + (pr_reg)[0] = (regs)->r15; \ + (pr_reg)[1] = (regs)->r14; \ +@@ -226,8 +227,8 @@ do { \ + (pr_reg)[18] = (regs)->flags; \ + (pr_reg)[19] = (regs)->sp; \ + (pr_reg)[20] = (regs)->ss; \ +- (pr_reg)[21] = current->thread.fsbase; \ +- (pr_reg)[22] = current->thread.gsbase; \ ++ rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ ++ rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ + asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ + asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ + asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index b3760b3c1ca0..0887d2ae3797 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -136,6 +136,123 @@ void release_thread(struct task_struct *dead_task) + } + } + ++enum which_selector { ++ FS, ++ GS ++}; ++ ++/* ++ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are ++ * not available. The goal is to be reasonably fast on non-FSGSBASE systems. ++ * It's forcibly inlined because it'll generate better code and this function ++ * is hot. ++ */ ++static __always_inline void save_base_legacy(struct task_struct *prev_p, ++ unsigned short selector, ++ enum which_selector which) ++{ ++ if (likely(selector == 0)) { ++ /* ++ * On Intel (without X86_BUG_NULL_SEG), the segment base could ++ * be the pre-existing saved base or it could be zero. On AMD ++ * (with X86_BUG_NULL_SEG), the segment base could be almost ++ * anything. ++ * ++ * This branch is very hot (it's hit twice on almost every ++ * context switch between 64-bit programs), and avoiding ++ * the RDMSR helps a lot, so we just assume that whatever ++ * value is already saved is correct. This matches historical ++ * Linux behavior, so it won't break existing applications. ++ * ++ * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we ++ * report that the base is zero, it needs to actually be zero: ++ * see the corresponding logic in load_seg_legacy. ++ */ ++ } else { ++ /* ++ * If the selector is 1, 2, or 3, then the base is zero on ++ * !X86_BUG_NULL_SEG CPUs and could be anything on ++ * X86_BUG_NULL_SEG CPUs. In the latter case, Linux ++ * has never attempted to preserve the base across context ++ * switches. ++ * ++ * If selector > 3, then it refers to a real segment, and ++ * saving the base isn't necessary. 
++ */ ++ if (which == FS) ++ prev_p->thread.fsbase = 0; ++ else ++ prev_p->thread.gsbase = 0; ++ } ++} ++ ++static __always_inline void save_fsgs(struct task_struct *task) ++{ ++ savesegment(fs, task->thread.fsindex); ++ savesegment(gs, task->thread.gsindex); ++ save_base_legacy(task, task->thread.fsindex, FS); ++ save_base_legacy(task, task->thread.gsindex, GS); ++} ++ ++static __always_inline void loadseg(enum which_selector which, ++ unsigned short sel) ++{ ++ if (which == FS) ++ loadsegment(fs, sel); ++ else ++ load_gs_index(sel); ++} ++ ++static __always_inline void load_seg_legacy(unsigned short prev_index, ++ unsigned long prev_base, ++ unsigned short next_index, ++ unsigned long next_base, ++ enum which_selector which) ++{ ++ if (likely(next_index <= 3)) { ++ /* ++ * The next task is using 64-bit TLS, is not using this ++ * segment at all, or is having fun with arcane CPU features. ++ */ ++ if (next_base == 0) { ++ /* ++ * Nasty case: on AMD CPUs, we need to forcibly zero ++ * the base. ++ */ ++ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { ++ loadseg(which, __USER_DS); ++ loadseg(which, next_index); ++ } else { ++ /* ++ * We could try to exhaustively detect cases ++ * under which we can skip the segment load, ++ * but there's really only one case that matters ++ * for performance: if both the previous and ++ * next states are fully zeroed, we can skip ++ * the load. ++ * ++ * (This assumes that prev_base == 0 has no ++ * false positives. This is the case on ++ * Intel-style CPUs.) ++ */ ++ if (likely(prev_index | next_index | prev_base)) ++ loadseg(which, next_index); ++ } ++ } else { ++ if (prev_index != next_index) ++ loadseg(which, next_index); ++ wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, ++ next_base); ++ } ++ } else { ++ /* ++ * The next task is using a real segment. Loading the selector ++ * is sufficient. ++ */ ++ loadseg(which, next_index); ++ } ++} ++ + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) + { +@@ -216,10 +333,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) + { ++ WARN_ON_ONCE(regs != current_pt_regs()); ++ ++ if (static_cpu_has(X86_BUG_NULL_SEG)) { ++ /* Loading zero below won't clear the base. */ ++ loadsegment(fs, __USER_DS); ++ load_gs_index(__USER_DS); ++ } ++ + loadsegment(fs, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); + load_gs_index(0); ++ + regs->ip = new_ip; + regs->sp = new_sp; + regs->cs = _cs; +@@ -264,7 +390,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + struct fpu *next_fpu = &next->fpu; + int cpu = smp_processor_id(); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); +- unsigned prev_fsindex, prev_gsindex; + fpu_switch_t fpu_switch; + + fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); +@@ -274,8 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + * + * (e.g. xen_load_tls()) + */ +- savesegment(fs, prev_fsindex); +- savesegment(gs, prev_gsindex); ++ save_fsgs(prev_p); + + /* + * Load TLS before restoring any segments so that segment loads +@@ -314,108 +438,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + +- /* +- * Switch FS and GS. +- * +- * These are even more complicated than DS and ES: they have +- * 64-bit bases are that controlled by arch_prctl. 
The bases +- * don't necessarily match the selectors, as user code can do +- * any number of things to cause them to be inconsistent. +- * +- * We don't promise to preserve the bases if the selectors are +- * nonzero. We also don't promise to preserve the base if the +- * selector is zero and the base doesn't match whatever was +- * most recently passed to ARCH_SET_FS/GS. (If/when the +- * FSGSBASE instructions are enabled, we'll need to offer +- * stronger guarantees.) +- * +- * As an invariant, +- * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is +- * impossible. +- */ +- if (next->fsindex) { +- /* Loading a nonzero value into FS sets the index and base. */ +- loadsegment(fs, next->fsindex); +- } else { +- if (next->fsbase) { +- /* Next index is zero but next base is nonzero. */ +- if (prev_fsindex) +- loadsegment(fs, 0); +- wrmsrl(MSR_FS_BASE, next->fsbase); +- } else { +- /* Next base and index are both zero. */ +- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { +- /* +- * We don't know the previous base and can't +- * find out without RDMSR. Forcibly clear it. +- */ +- loadsegment(fs, __USER_DS); +- loadsegment(fs, 0); +- } else { +- /* +- * If the previous index is zero and ARCH_SET_FS +- * didn't change the base, then the base is +- * also zero and we don't need to do anything. +- */ +- if (prev->fsbase || prev_fsindex) +- loadsegment(fs, 0); +- } +- } +- } +- /* +- * Save the old state and preserve the invariant. +- * NB: if prev_fsindex == 0, then we can't reliably learn the base +- * without RDMSR because Intel user code can zero it without telling +- * us and AMD user code can program any 32-bit value without telling +- * us. +- */ +- if (prev_fsindex) +- prev->fsbase = 0; +- prev->fsindex = prev_fsindex; +- +- if (next->gsindex) { +- /* Loading a nonzero value into GS sets the index and base. */ +- load_gs_index(next->gsindex); +- } else { +- if (next->gsbase) { +- /* Next index is zero but next base is nonzero. */ +- if (prev_gsindex) +- load_gs_index(0); +- wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); +- } else { +- /* Next base and index are both zero. */ +- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { +- /* +- * We don't know the previous base and can't +- * find out without RDMSR. Forcibly clear it. +- * +- * This contains a pointless SWAPGS pair. +- * Fixing it would involve an explicit check +- * for Xen or a new pvop. +- */ +- load_gs_index(__USER_DS); +- load_gs_index(0); +- } else { +- /* +- * If the previous index is zero and ARCH_SET_GS +- * didn't change the base, then the base is +- * also zero and we don't need to do anything. +- */ +- if (prev->gsbase || prev_gsindex) +- load_gs_index(0); +- } +- } +- } +- /* +- * Save the old state and preserve the invariant. +- * NB: if prev_gsindex == 0, then we can't reliably learn the base +- * without RDMSR because Intel user code can zero it without telling +- * us and AMD user code can program any 32-bit value without telling +- * us. 
+- */ +- if (prev_gsindex) +- prev->gsbase = 0; +- prev->gsindex = prev_gsindex; ++ load_seg_legacy(prev->fsindex, prev->fsbase, ++ next->fsindex, next->fsbase, FS); ++ load_seg_legacy(prev->gsindex, prev->gsbase, ++ next->gsindex, next->gsbase, GS); + + switch_fpu_finish(next_fpu, fpu_switch); + +diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c +index 383f19c6bf24..549b4afd12e1 100644 +--- a/drivers/md/raid5.c ++++ b/drivers/md/raid5.c +@@ -5844,6 +5844,8 @@ static void raid5_do_work(struct work_struct *work) + + spin_unlock_irq(&conf->device_lock); + ++ r5l_flush_stripe_to_raid(conf->log); ++ + async_tx_issue_pending_all(); + blk_finish_plug(&plug); + +diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +index e8139514d32c..9e073fb6870a 100644 +--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c ++++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +@@ -317,12 +317,12 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, + + if (v != MBOX_OWNER_DRV) { + ret = (v == MBOX_OWNER_FW) ? -EBUSY : -ETIMEDOUT; +- t4_record_mbox(adap, cmd, MBOX_LEN, access, ret); ++ t4_record_mbox(adap, cmd, size, access, ret); + return ret; + } + + /* Copy in the new mailbox command and send it on its way ... */ +- t4_record_mbox(adap, cmd, MBOX_LEN, access, 0); ++ t4_record_mbox(adap, cmd, size, access, 0); + for (i = 0; i < size; i += 8) + t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++)); + +@@ -371,7 +371,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, + } + + ret = (pcie_fw & PCIE_FW_ERR_F) ? -ENXIO : -ETIMEDOUT; +- t4_record_mbox(adap, cmd, MBOX_LEN, access, ret); ++ t4_record_mbox(adap, cmd, size, access, ret); + dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n", + *(const u8 *)cmd, mbox); + t4_report_fw_error(adap); +diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c +index 736db9d9b0ad..81021f87e4f3 100644 +--- a/drivers/net/ethernet/freescale/fman/mac.c ++++ b/drivers/net/ethernet/freescale/fman/mac.c +@@ -622,6 +622,9 @@ static struct platform_device *dpaa_eth_add_device(int fman_id, + goto no_mem; + } + ++ pdev->dev.of_node = node; ++ pdev->dev.parent = priv->dev; ++ + ret = platform_device_add_data(pdev, &data, sizeof(data)); + if (ret) + goto err; +diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c +index 3f4e71148808..fd206889a433 100644 +--- a/drivers/net/ethernet/freescale/gianfar.c ++++ b/drivers/net/ethernet/freescale/gianfar.c +@@ -3690,7 +3690,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv) + u32 tempval1 = gfar_read(®s->maccfg1); + u32 tempval = gfar_read(®s->maccfg2); + u32 ecntrl = gfar_read(®s->ecntrl); +- u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW); ++ u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW); + + if (phydev->duplex != priv->oldduplex) { + if (!(phydev->duplex)) +diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +index f902c4d3de99..1806b1fc6e4c 100644 +--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c ++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +@@ -4172,6 +4172,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev, + return -EINVAL; + if (!info->linking) + break; ++ if (netdev_has_any_upper_dev(upper_dev)) ++ return -EINVAL; + /* HW limitation forbids to put ports to multiple bridges. 
*/ + if (netif_is_bridge_master(upper_dev) && + !mlxsw_sp_master_bridge_check(mlxsw_sp, upper_dev)) +@@ -4185,6 +4187,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev, + if (netif_is_lag_port(dev) && is_vlan_dev(upper_dev) && + !netif_is_lag_master(vlan_dev_real_dev(upper_dev))) + return -EINVAL; ++ if (!info->linking) ++ break; ++ if (netdev_has_any_upper_dev(upper_dev)) ++ return -EINVAL; + break; + case NETDEV_CHANGEUPPER: + upper_dev = info->upper_dev; +diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c +index 829be21f97b2..be258d90de9e 100644 +--- a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c ++++ b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c +@@ -724,7 +724,7 @@ static void ql_build_coredump_seg_header( + seg_hdr->cookie = MPI_COREDUMP_COOKIE; + seg_hdr->segNum = seg_number; + seg_hdr->segSize = seg_size; +- memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); ++ strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); + } + + /* +diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c +index ff038e507fd6..36a04e182af1 100644 +--- a/drivers/net/hyperv/netvsc_drv.c ++++ b/drivers/net/hyperv/netvsc_drv.c +@@ -1084,7 +1084,12 @@ static void netvsc_link_change(struct work_struct *w) + bool notify = false, reschedule = false; + unsigned long flags, next_reconfig, delay; + +- rtnl_lock(); ++ /* if changes are happening, comeback later */ ++ if (!rtnl_trylock()) { ++ schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT); ++ return; ++ } ++ + if (ndev_ctx->start_remove) + goto out_unlock; + +diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c +index a5d66e205bb2..2caac0c37059 100644 +--- a/drivers/net/macsec.c ++++ b/drivers/net/macsec.c +@@ -3510,6 +3510,7 @@ module_init(macsec_init); + module_exit(macsec_exit); + + MODULE_ALIAS_RTNL_LINK("macsec"); ++MODULE_ALIAS_GENL_FAMILY("macsec"); + + MODULE_DESCRIPTION("MACsec IEEE 802.1AE"); + MODULE_LICENSE("GPL v2"); +diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c +index 775a6e1fdef9..6e12401b5102 100644 +--- a/drivers/net/phy/phy.c ++++ b/drivers/net/phy/phy.c +@@ -674,9 +674,6 @@ void phy_stop_machine(struct phy_device *phydev) + if (phydev->state > PHY_UP && phydev->state != PHY_HALTED) + phydev->state = PHY_UP; + mutex_unlock(&phydev->lock); +- +- /* Now we can run the state machine synchronously */ +- phy_state_machine(&phydev->state_queue.work); + } + + /** +diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c +index 5dc128a8da83..96a0661011fd 100644 +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -537,8 +537,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) + + preempt_enable(); + +- if (vhost_enable_notify(&net->dev, vq)) ++ if (!vhost_vq_avail_empty(&net->dev, vq)) + vhost_poll_queue(&vq->poll); ++ else if (unlikely(vhost_enable_notify(&net->dev, vq))) { ++ vhost_disable_notify(&net->dev, vq); ++ vhost_poll_queue(&vq->poll); ++ } ++ + mutex_unlock(&vq->mutex); + + len = peek_head_len(sk); +diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c +index 2fc84a991325..98c1a63a4614 100644 +--- a/fs/f2fs/recovery.c ++++ b/fs/f2fs/recovery.c +@@ -316,7 +316,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, + return 0; + + /* Get the previous summary */ +- for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { ++ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + if 
(curseg->segno == segno) { + sum = curseg->sum_blk->entries[blkoff]; +@@ -626,8 +626,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) + } + + clear_sbi_flag(sbi, SBI_POR_DOING); +- if (err) +- set_ckpt_flags(sbi, CP_ERROR_FLAG); + mutex_unlock(&sbi->cp_mutex); + + /* let's drop all the directory inodes for clean checkpoint */ +diff --git a/fs/inode.c b/fs/inode.c +index 88110fd0b282..920aa0b1c6b0 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -637,6 +637,7 @@ void evict_inodes(struct super_block *sb) + + dispose_list(&dispose); + } ++EXPORT_SYMBOL_GPL(evict_inodes); + + /** + * invalidate_inodes - attempt to free all inodes on a superblock +diff --git a/fs/internal.h b/fs/internal.h +index f4da3341b4a3..8b7143b0211c 100644 +--- a/fs/internal.h ++++ b/fs/internal.h +@@ -136,7 +136,6 @@ extern bool atime_needs_update_rcu(const struct path *, struct inode *); + extern void inode_io_list_del(struct inode *inode); + + extern long get_nr_dirty_inodes(void); +-extern void evict_inodes(struct super_block *); + extern int invalidate_inodes(struct super_block *, bool); + + /* +diff --git a/fs/iomap.c b/fs/iomap.c +index 798c291cbc75..a49db8806a3a 100644 +--- a/fs/iomap.c ++++ b/fs/iomap.c +@@ -281,7 +281,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + unsigned long bytes; /* Bytes to write to page */ + + offset = (pos & (PAGE_SIZE - 1)); +- bytes = min_t(unsigned long, PAGE_SIZE - offset, length); ++ bytes = min_t(loff_t, PAGE_SIZE - offset, length); + + rpage = __iomap_read_page(inode, pos); + if (IS_ERR(rpage)) +@@ -376,7 +376,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, + unsigned offset, bytes; + + offset = pos & (PAGE_SIZE - 1); /* Within page */ +- bytes = min_t(unsigned, PAGE_SIZE - offset, count); ++ bytes = min_t(loff_t, PAGE_SIZE - offset, count); + + if (IS_DAX(inode)) + status = iomap_dax_zero(pos, offset, bytes, iomap); +diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c +index 2852521fc8ec..c6c15e5717e4 100644 +--- a/fs/xfs/libxfs/xfs_attr_leaf.c ++++ b/fs/xfs/libxfs/xfs_attr_leaf.c +@@ -351,7 +351,7 @@ xfs_attr3_leaf_read( + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); + return err; + } +diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c +index 2a8cbd15d5d1..d2f4ab175096 100644 +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -579,7 +579,7 @@ xfs_bmap_validate_ret( + + #else + #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +-#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) ++#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0) + #endif /* DEBUG */ + + /* +@@ -5555,6 +5555,8 @@ __xfs_bunmapi( + int whichfork; /* data or attribute fork */ + xfs_fsblock_t sum; + xfs_filblks_t len = *rlen; /* length to unmap in file */ ++ xfs_fileoff_t max_len; ++ xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; + + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + +@@ -5576,6 +5578,16 @@ __xfs_bunmapi( + ASSERT(len > 0); + ASSERT(nexts >= 0); + ++ /* ++ * Guesstimate how many blocks we can unmap without running the risk of ++ * blowing out the transaction with a mix of EFIs and reflink ++ * adjustments. 
++ */ ++ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) ++ max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); ++ else ++ max_len = len; ++ + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; +@@ -5621,7 +5633,7 @@ __xfs_bunmapi( + + extno = 0; + while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && +- (nexts == 0 || extno < nexts)) { ++ (nexts == 0 || extno < nexts) && max_len > 0) { + /* + * Is the found extent after a hole in which bno lives? + * Just back up to the previous extent, if so. +@@ -5647,6 +5659,17 @@ __xfs_bunmapi( + ASSERT(ep != NULL); + del = got; + wasdel = isnullstartblock(del.br_startblock); ++ ++ /* ++ * Make sure we don't touch multiple AGF headers out of order ++ * in a single transaction, as that could cause AB-BA deadlocks. ++ */ ++ if (!wasdel) { ++ agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); ++ if (prev_agno != NULLAGNUMBER && prev_agno > agno) ++ break; ++ prev_agno = agno; ++ } + if (got.br_startoff < start) { + del.br_startoff = start; + del.br_blockcount -= start - got.br_startoff; +@@ -5655,6 +5678,15 @@ __xfs_bunmapi( + } + if (del.br_startoff + del.br_blockcount > bno + 1) + del.br_blockcount = bno + 1 - del.br_startoff; ++ ++ /* How much can we safely unmap? */ ++ if (max_len < del.br_blockcount) { ++ del.br_startoff += del.br_blockcount - max_len; ++ if (!wasdel) ++ del.br_startblock += del.br_blockcount - max_len; ++ del.br_blockcount = max_len; ++ } ++ + sum = del.br_startblock + del.br_blockcount; + if (isrt && + (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { +@@ -5835,6 +5867,7 @@ __xfs_bunmapi( + if (!isrt && wasdel) + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); + ++ max_len -= del.br_blockcount; + bno = del.br_startoff - 1; + nodelete: + /* +@@ -6604,25 +6637,33 @@ xfs_bmap_finish_one( + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, ++ xfs_filblks_t *blockcount, + xfs_exntst_t state) + { + struct xfs_bmbt_irec bmap; + int nimaps = 1; + xfs_fsblock_t firstfsb; + int flags = XFS_BMAPI_REMAP; +- int done; + int error = 0; + + bmap.br_startblock = startblock; + bmap.br_startoff = startoff; +- bmap.br_blockcount = blockcount; ++ bmap.br_blockcount = *blockcount; + bmap.br_state = state; + ++ /* ++ * firstfsb is tied to the transaction lifetime and is used to ++ * ensure correct AG locking order and schedule work item ++ * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us ++ * to only making one bmap call per transaction, so it should ++ * be safe to have it as a local variable here. 
++ */ ++ firstfsb = NULLFSBLOCK; ++ + trace_xfs_bmap_deferred(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, + XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), +- ip->i_ino, whichfork, startoff, blockcount, state); ++ ip->i_ino, whichfork, startoff, *blockcount, state); + + if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) + return -EFSCORRUPTED; +@@ -6641,12 +6682,11 @@ xfs_bmap_finish_one( + bmap.br_blockcount, flags, &firstfsb, + bmap.br_blockcount, &bmap, &nimaps, + dfops); ++ *blockcount = 0; + break; + case XFS_BMAP_UNMAP: +- error = xfs_bunmapi(tp, ip, bmap.br_startoff, +- bmap.br_blockcount, flags, 1, &firstfsb, +- dfops, &done); +- ASSERT(done); ++ error = __xfs_bunmapi(tp, ip, startoff, blockcount, ++ XFS_BMAPI_REMAP, 1, &firstfsb, dfops); + break; + default: + ASSERT(0); +diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h +index e7d40b39f18f..db53ac7ff6df 100644 +--- a/fs/xfs/libxfs/xfs_bmap.h ++++ b/fs/xfs/libxfs/xfs_bmap.h +@@ -265,7 +265,7 @@ struct xfs_bmap_intent { + int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, enum xfs_bmap_intent_type type, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, xfs_exntst_t state); ++ xfs_filblks_t *blockcount, xfs_exntst_t state); + int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); + int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, +diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c +index 5c3918678bb6..9968a746c649 100644 +--- a/fs/xfs/libxfs/xfs_bmap_btree.c ++++ b/fs/xfs/libxfs/xfs_bmap_btree.c +@@ -888,6 +888,7 @@ xfs_bmbt_change_owner( + cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); + if (!cur) + return -ENOMEM; ++ cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; + + error = xfs_btree_change_owner(cur, new_owner, buffer_list); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); +diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c +index 91c68913d495..4ad1e214b1b2 100644 +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -714,7 +714,8 @@ xfs_btree_firstrec( + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); +- xfs_btree_check_block(cur, block, level, bp); ++ if (xfs_btree_check_block(cur, block, level, bp)) ++ return 0; + /* + * It's empty, there is no such record. + */ +@@ -743,7 +744,8 @@ xfs_btree_lastrec( + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); +- xfs_btree_check_block(cur, block, level, bp); ++ if (xfs_btree_check_block(cur, block, level, bp)) ++ return 0; + /* + * It's empty, there is no such record. + */ +@@ -1772,6 +1774,7 @@ xfs_btree_lookup_get_block( + + /* Check the inode owner since the verifiers don't. 
*/ + if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && ++ !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && + (cur->bc_flags & XFS_BTREE_LONG_PTRS) && + be64_to_cpu((*blkp)->bb_u.l.bb_owner) != + cur->bc_private.b.ip->i_ino) +@@ -4432,10 +4435,15 @@ xfs_btree_block_change_owner( + + /* modify the owner */ + block = xfs_btree_get_block(cur, level, &bp); +- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) ++ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { ++ if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) ++ return 0; + block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); +- else ++ } else { ++ if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) ++ return 0; + block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); ++ } + + /* + * If the block is a root block hosted in an inode, we might not have a +@@ -4444,16 +4452,19 @@ xfs_btree_block_change_owner( + * block is formatted into the on-disk inode fork. We still change it, + * though, so everything is consistent in memory. + */ +- if (bp) { +- if (cur->bc_tp) { +- xfs_trans_ordered_buf(cur->bc_tp, bp); ++ if (!bp) { ++ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ++ ASSERT(level == cur->bc_nlevels - 1); ++ return 0; ++ } ++ ++ if (cur->bc_tp) { ++ if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { + xfs_btree_log_block(cur, bp, XFS_BB_OWNER); +- } else { +- xfs_buf_delwri_queue(bp, bbcoi->buffer_list); ++ return -EAGAIN; + } + } else { +- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); +- ASSERT(level == cur->bc_nlevels - 1); ++ xfs_buf_delwri_queue(bp, bbcoi->buffer_list); + } + + return 0; +diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h +index 3b0fc1afada5..33c7be2357b9 100644 +--- a/fs/xfs/libxfs/xfs_btree.h ++++ b/fs/xfs/libxfs/xfs_btree.h +@@ -268,7 +268,8 @@ typedef struct xfs_btree_cur + short forksize; /* fork's inode space */ + char whichfork; /* data or attr fork */ + char flags; /* flags */ +-#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ ++#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ ++#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ + } b; + } bc_private; /* per-btree type data */ + } xfs_btree_cur_t; +diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c +index 1bdf2888295b..b305dbfd81c4 100644 +--- a/fs/xfs/libxfs/xfs_da_btree.c ++++ b/fs/xfs/libxfs/xfs_da_btree.c +@@ -263,7 +263,7 @@ xfs_da3_node_read( + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da3_node_buf_ops); +- if (!err && tp) { ++ if (!err && tp && *bpp) { + struct xfs_da_blkinfo *info = (*bpp)->b_addr; + int type; + +diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c +index aa17cb788946..43c902f7a68d 100644 +--- a/fs/xfs/libxfs/xfs_dir2_block.c ++++ b/fs/xfs/libxfs/xfs_dir2_block.c +@@ -139,7 +139,7 @@ xfs_dir3_block_read( + + err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir3_block_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + return err; + } +diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c +index b887fb2a2bcf..f2e342e05365 100644 +--- a/fs/xfs/libxfs/xfs_dir2_leaf.c ++++ b/fs/xfs/libxfs/xfs_dir2_leaf.c +@@ -268,7 +268,7 @@ xfs_dir3_leaf_read( + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); + return err; + } +@@ -285,7 +285,7 @@ 
xfs_dir3_leafn_read( + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); +- if (!err && tp) ++ if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); + return err; + } +diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c +index a2818f6e8598..42fef0731e2a 100644 +--- a/fs/xfs/libxfs/xfs_ialloc.c ++++ b/fs/xfs/libxfs/xfs_ialloc.c +@@ -368,8 +368,6 @@ xfs_ialloc_inode_init( + * transaction and pin the log appropriately. + */ + xfs_trans_ordered_buf(tp, fbuf); +- xfs_trans_log_buf(tp, fbuf, 0, +- BBTOB(fbuf->b_length) - 1); + } + } else { + fbuf->b_flags |= XBF_DONE; +@@ -1123,6 +1121,7 @@ xfs_dialloc_ag_inobt( + int error; + int offset; + int i, j; ++ int searchdistance = 10; + + pag = xfs_perag_get(mp, agno); + +@@ -1149,7 +1148,6 @@ xfs_dialloc_ag_inobt( + if (pagno == agno) { + int doneleft; /* done, to the left */ + int doneright; /* done, to the right */ +- int searchdistance = 10; + + error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); + if (error) +@@ -1210,21 +1208,9 @@ xfs_dialloc_ag_inobt( + /* + * Loop until we find an inode chunk with a free inode. + */ +- while (!doneleft || !doneright) { ++ while (--searchdistance > 0 && (!doneleft || !doneright)) { + int useleft; /* using left inode chunk this time */ + +- if (!--searchdistance) { +- /* +- * Not in range - save last search +- * location and allocate a new inode +- */ +- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); +- pag->pagl_leftrec = trec.ir_startino; +- pag->pagl_rightrec = rec.ir_startino; +- pag->pagl_pagino = pagino; +- goto newino; +- } +- + /* figure out the closer block if both are valid. */ + if (!doneleft && !doneright) { + useleft = pagino - +@@ -1236,13 +1222,13 @@ xfs_dialloc_ag_inobt( + + /* free inodes to the left? */ + if (useleft && trec.ir_freecount) { +- rec = trec; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = tcur; + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; ++ rec = trec; + goto alloc_inode; + } + +@@ -1268,26 +1254,37 @@ xfs_dialloc_ag_inobt( + goto error1; + } + +- /* +- * We've reached the end of the btree. because +- * we are only searching a small chunk of the +- * btree each search, there is obviously free +- * inodes closer to the parent inode than we +- * are now. restart the search again. +- */ +- pag->pagl_pagino = NULLAGINO; +- pag->pagl_leftrec = NULLAGINO; +- pag->pagl_rightrec = NULLAGINO; +- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); +- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); +- goto restart_pagno; ++ if (searchdistance <= 0) { ++ /* ++ * Not in range - save last search ++ * location and allocate a new inode ++ */ ++ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); ++ pag->pagl_leftrec = trec.ir_startino; ++ pag->pagl_rightrec = rec.ir_startino; ++ pag->pagl_pagino = pagino; ++ ++ } else { ++ /* ++ * We've reached the end of the btree. because ++ * we are only searching a small chunk of the ++ * btree each search, there is obviously free ++ * inodes closer to the parent inode than we ++ * are now. restart the search again. ++ */ ++ pag->pagl_pagino = NULLAGINO; ++ pag->pagl_leftrec = NULLAGINO; ++ pag->pagl_rightrec = NULLAGINO; ++ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); ++ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); ++ goto restart_pagno; ++ } + } + + /* + * In a different AG from the parent. + * See if the most recently allocated block has any free. 
+ */ +-newino: + if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); +diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c +index 8a37efe04de3..4e30448c4465 100644 +--- a/fs/xfs/libxfs/xfs_inode_fork.c ++++ b/fs/xfs/libxfs/xfs_inode_fork.c +@@ -1539,14 +1539,11 @@ xfs_iext_realloc_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new indirection array size */ + { +- int nlists; /* number of irec's (ex lists) */ +- int size; /* current indirection array size */ +- + ASSERT(ifp->if_flags & XFS_IFEXTIREC); +- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; +- size = nlists * sizeof(xfs_ext_irec_t); + ASSERT(ifp->if_real_bytes); +- ASSERT((new_size >= 0) && (new_size != size)); ++ ASSERT((new_size >= 0) && ++ (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) * ++ sizeof(xfs_ext_irec_t)))); + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else { +diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c +index 82a38d86ebad..d71cb63cdea3 100644 +--- a/fs/xfs/libxfs/xfs_refcount.c ++++ b/fs/xfs/libxfs/xfs_refcount.c +@@ -784,14 +784,6 @@ xfs_refcount_merge_extents( + } + + /* +- * While we're adjusting the refcounts records of an extent, we have +- * to keep an eye on the number of extents we're dirtying -- run too +- * many in a single transaction and we'll exceed the transaction's +- * reservation and crash the fs. Each record adds 12 bytes to the +- * log (plus any key updates) so we'll conservatively assume 24 bytes +- * per record. We must also leave space for btree splits on both ends +- * of the range and space for the CUD and a new CUI. +- * + * XXX: This is a pretty hand-wavy estimate. The penalty for guessing + * true incorrectly is a shutdown FS; the penalty for guessing false + * incorrectly is more transaction rolls than might be necessary. +@@ -822,7 +814,7 @@ xfs_refcount_still_have_space( + else if (overhead > cur->bc_tp->t_log_res) + return false; + return cur->bc_tp->t_log_res - overhead > +- cur->bc_private.a.priv.refc.nr_ops * 32; ++ cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + } + + /* +@@ -1648,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers( + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (error) + goto out_trans; ++ if (!agbp) { ++ error = -ENOMEM; ++ goto out_trans; ++ } + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); + + /* Find all the leftover CoW staging extents. */ +diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h +index 098dc668ab2c..eafb9d1f3b37 100644 +--- a/fs/xfs/libxfs/xfs_refcount.h ++++ b/fs/xfs/libxfs/xfs_refcount.h +@@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, + extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, + xfs_agnumber_t agno); + ++/* ++ * While we're adjusting the refcounts records of an extent, we have ++ * to keep an eye on the number of extents we're dirtying -- run too ++ * many in a single transaction and we'll exceed the transaction's ++ * reservation and crash the fs. Each record adds 12 bytes to the ++ * log (plus any key updates) so we'll conservatively assume 32 bytes ++ * per record. We must also leave space for btree splits on both ends ++ * of the range and space for the CUD and a new CUI. 
++ */ ++#define XFS_REFCOUNT_ITEM_OVERHEAD 32 ++ ++static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) ++{ ++ return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; ++} ++ + #endif /* __XFS_REFCOUNT_H__ */ +diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c +index 578981412615..d23889e0bedc 100644 +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -90,11 +90,11 @@ xfs_find_bdev_for_inode( + * associated buffer_heads, paying attention to the start and end offsets that + * we need to process on the page. + * +- * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last +- * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or +- * the page at all, as we may be racing with memory reclaim and it can free both +- * the bufferhead chain and the page as it will see the page as clean and +- * unused. ++ * Note that we open code the action in end_buffer_async_write here so that we ++ * only have to iterate over the buffers attached to the page once. This is not ++ * only more efficient, but also ensures that we only calls end_page_writeback ++ * at the end of the iteration, and thus avoids the pitfall of having the page ++ * and buffers potentially freed after every call to end_buffer_async_write. + */ + static void + xfs_finish_page_writeback( +@@ -102,29 +102,45 @@ xfs_finish_page_writeback( + struct bio_vec *bvec, + int error) + { +- unsigned int end = bvec->bv_offset + bvec->bv_len - 1; +- struct buffer_head *head, *bh, *next; ++ struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head; ++ bool busy = false; + unsigned int off = 0; +- unsigned int bsize; ++ unsigned long flags; + + ASSERT(bvec->bv_offset < PAGE_SIZE); + ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); +- ASSERT(end < PAGE_SIZE); ++ ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); + ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); + +- bh = head = page_buffers(bvec->bv_page); +- +- bsize = bh->b_size; ++ local_irq_save(flags); ++ bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { +- if (off > end) +- break; +- next = bh->b_this_page; +- if (off < bvec->bv_offset) +- goto next_bh; +- bh->b_end_io(bh, !error); +-next_bh: +- off += bsize; +- } while ((bh = next) != head); ++ if (off >= bvec->bv_offset && ++ off < bvec->bv_offset + bvec->bv_len) { ++ ASSERT(buffer_async_write(bh)); ++ ASSERT(bh->b_end_io == NULL); ++ ++ if (error) { ++ mapping_set_error(bvec->bv_page->mapping, -EIO); ++ set_buffer_write_io_error(bh); ++ clear_buffer_uptodate(bh); ++ SetPageError(bvec->bv_page); ++ } else { ++ set_buffer_uptodate(bh); ++ } ++ clear_buffer_async_write(bh); ++ unlock_buffer(bh); ++ } else if (buffer_async_write(bh)) { ++ ASSERT(buffer_locked(bh)); ++ busy = true; ++ } ++ off += bh->b_size; ++ } while ((bh = bh->b_this_page) != head); ++ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); ++ local_irq_restore(flags); ++ ++ if (!busy) ++ end_page_writeback(bvec->bv_page); + } + + /* +@@ -138,8 +154,10 @@ xfs_destroy_ioend( + int error) + { + struct inode *inode = ioend->io_inode; +- struct bio *last = ioend->io_bio; +- struct bio *bio, *next; ++ struct bio *bio = &ioend->io_inline_bio; ++ struct bio *last = ioend->io_bio, *next; ++ u64 start = bio->bi_iter.bi_sector; ++ bool quiet = bio_flagged(bio, BIO_QUIET); + + for (bio = &ioend->io_inline_bio; bio; bio = next) { + struct bio_vec *bvec; +@@ -160,6 +178,11 @@ xfs_destroy_ioend( + + bio_put(bio); + } ++ ++ if (unlikely(error && !quiet)) { ++ xfs_err_ratelimited(XFS_I(inode)->i_mount, ++ 
"writeback error on sector %llu", start); ++ } + } + + /* +@@ -427,7 +450,8 @@ xfs_start_buffer_writeback( + ASSERT(!buffer_delay(bh)); + ASSERT(!buffer_unwritten(bh)); + +- mark_buffer_async_write(bh); ++ bh->b_end_io = NULL; ++ set_buffer_async_write(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + } +@@ -1566,9 +1590,12 @@ xfs_vm_bmap( + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasseѕ the file system for actual I/O. We really can't allow + * that on reflinks inodes, so we have to skip out here. And yes, +- * 0 is the magic code for a bmap error.. ++ * 0 is the magic code for a bmap error. ++ * ++ * Since we don't pass back blockdev info, we can't return bmap ++ * information for rt files either. + */ +- if (xfs_is_reflink_inode(ip)) { ++ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } +diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c +index c4b90e794e41..5a54dcd7e7b1 100644 +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -395,6 +395,7 @@ xfs_bui_recover( + struct xfs_map_extent *bmap; + xfs_fsblock_t startblock_fsb; + xfs_fsblock_t inode_fsb; ++ xfs_filblks_t count; + bool op_ok; + struct xfs_bud_log_item *budp; + enum xfs_bmap_intent_type type; +@@ -403,6 +404,7 @@ xfs_bui_recover( + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_defer_ops dfops; ++ struct xfs_bmbt_irec irec; + xfs_fsblock_t firstfsb; + + ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); +@@ -480,13 +482,24 @@ xfs_bui_recover( + } + xfs_trans_ijoin(tp, ip, 0); + ++ count = bmap->me_len; + error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + ip, whichfork, bmap->me_startoff, +- bmap->me_startblock, bmap->me_len, +- state); ++ bmap->me_startblock, &count, state); + if (error) + goto err_dfops; + ++ if (count > 0) { ++ ASSERT(type == XFS_BMAP_UNMAP); ++ irec.br_startblock = bmap->me_startblock; ++ irec.br_blockcount = count; ++ irec.br_startoff = bmap->me_startoff; ++ irec.br_state = state; ++ error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); ++ if (error) ++ goto err_dfops; ++ } ++ + /* Finish transaction, free inodes. */ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) +diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c +index 87b495e2f15a..5ffefac081f7 100644 +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -1825,29 +1825,18 @@ xfs_swap_extent_forks( + } + + /* +- * Before we've swapped the forks, lets set the owners of the forks +- * appropriately. We have to do this as we are demand paging the btree +- * buffers, and so the validation done on read will expect the owner +- * field to be correctly set. Once we change the owners, we can swap the +- * inode forks. ++ * Btree format (v3) inodes have the inode number stamped in the bmbt ++ * block headers. We can't start changing the bmbt blocks until the ++ * inode owner change is logged so recovery does the right thing in the ++ * event of a crash. Set the owner change log flags now and leave the ++ * bmbt scan as the last step. 
+ */ + if (ip->i_d.di_version == 3 && +- ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { ++ ip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*target_log_flags) |= XFS_ILOG_DOWNER; +- error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, +- tip->i_ino, NULL); +- if (error) +- return error; +- } +- + if (tip->i_d.di_version == 3 && +- tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { ++ tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*src_log_flags) |= XFS_ILOG_DOWNER; +- error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, +- ip->i_ino, NULL); +- if (error) +- return error; +- } + + /* + * Swap the data forks of the inodes +@@ -1925,6 +1914,48 @@ xfs_swap_extent_forks( + return 0; + } + ++/* ++ * Fix up the owners of the bmbt blocks to refer to the current inode. The ++ * change owner scan attempts to order all modified buffers in the current ++ * transaction. In the event of ordered buffer failure, the offending buffer is ++ * physically logged as a fallback and the scan returns -EAGAIN. We must roll ++ * the transaction in this case to replenish the fallback log reservation and ++ * restart the scan. This process repeats until the scan completes. ++ */ ++static int ++xfs_swap_change_owner( ++ struct xfs_trans **tpp, ++ struct xfs_inode *ip, ++ struct xfs_inode *tmpip) ++{ ++ int error; ++ struct xfs_trans *tp = *tpp; ++ ++ do { ++ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino, ++ NULL); ++ /* success or fatal error */ ++ if (error != -EAGAIN) ++ break; ++ ++ error = xfs_trans_roll(tpp, NULL); ++ if (error) ++ break; ++ tp = *tpp; ++ ++ /* ++ * Redirty both inodes so they can relog and keep the log tail ++ * moving forward. ++ */ ++ xfs_trans_ijoin(tp, ip, 0); ++ xfs_trans_ijoin(tp, tmpip, 0); ++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); ++ xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE); ++ } while (true); ++ ++ return error; ++} ++ + int + xfs_swap_extents( + struct xfs_inode *ip, /* target inode */ +@@ -1938,8 +1969,8 @@ xfs_swap_extents( + int error = 0; + int lock_flags; + struct xfs_ifork *cowfp; +- __uint64_t f; +- int resblks; ++ uint64_t f; ++ int resblks = 0; + + /* + * Lock the inodes against other IO, page faults and truncate to +@@ -1987,11 +2018,8 @@ xfs_swap_extents( + XFS_SWAP_RMAP_SPACE_RES(mp, + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), + XFS_DATA_FORK); +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, +- 0, 0, &tp); +- } else +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, +- 0, 0, &tp); ++ } ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) + goto out_unlock; + +@@ -2076,6 +2104,23 @@ xfs_swap_extents( + xfs_trans_log_inode(tp, ip, src_log_flags); + xfs_trans_log_inode(tp, tip, target_log_flags); + ++ /* ++ * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems ++ * have inode number owner values in the bmbt blocks that still refer to ++ * the old inode. Scan each bmbt to fix up the owner values with the ++ * inode number of the current inode. ++ */ ++ if (src_log_flags & XFS_ILOG_DOWNER) { ++ error = xfs_swap_change_owner(&tp, ip, tip); ++ if (error) ++ goto out_trans_cancel; ++ } ++ if (target_log_flags & XFS_ILOG_DOWNER) { ++ error = xfs_swap_change_owner(&tp, tip, ip); ++ if (error) ++ goto out_trans_cancel; ++ } ++ + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. 
+diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c +index 16269271ebd6..eca7baecc9f0 100644 +--- a/fs/xfs/xfs_buf.c ++++ b/fs/xfs/xfs_buf.c +@@ -116,7 +116,7 @@ static inline void + __xfs_buf_ioacct_dec( + struct xfs_buf *bp) + { +- ASSERT(spin_is_locked(&bp->b_lock)); ++ lockdep_assert_held(&bp->b_lock); + + if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { + bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; +@@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit( + return error; + } + ++/* ++ * Push a single buffer on a delwri queue. ++ * ++ * The purpose of this function is to submit a single buffer of a delwri queue ++ * and return with the buffer still on the original queue. The waiting delwri ++ * buffer submission infrastructure guarantees transfer of the delwri queue ++ * buffer reference to a temporary wait list. We reuse this infrastructure to ++ * transfer the buffer back to the original queue. ++ * ++ * Note the buffer transitions from the queued state, to the submitted and wait ++ * listed state and back to the queued state during this call. The buffer ++ * locking and queue management logic between _delwri_pushbuf() and ++ * _delwri_queue() guarantee that the buffer cannot be queued to another list ++ * before returning. ++ */ ++int ++xfs_buf_delwri_pushbuf( ++ struct xfs_buf *bp, ++ struct list_head *buffer_list) ++{ ++ LIST_HEAD (submit_list); ++ int error; ++ ++ ASSERT(bp->b_flags & _XBF_DELWRI_Q); ++ ++ trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); ++ ++ /* ++ * Isolate the buffer to a new local list so we can submit it for I/O ++ * independently from the rest of the original list. ++ */ ++ xfs_buf_lock(bp); ++ list_move(&bp->b_list, &submit_list); ++ xfs_buf_unlock(bp); ++ ++ /* ++ * Delwri submission clears the DELWRI_Q buffer flag and returns with ++ * the buffer on the wait list with an associated reference. Rather than ++ * bounce the buffer from a local wait list back to the original list ++ * after I/O completion, reuse the original list as the wait list. ++ */ ++ xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); ++ ++ /* ++ * The buffer is now under I/O and wait listed as during typical delwri ++ * submission. Lock the buffer to wait for I/O completion. Rather than ++ * remove the buffer from the wait list and release the reference, we ++ * want to return with the buffer queued to the original list. The ++ * buffer already sits on the original list with a wait list reference, ++ * however. If we let the queue inherit that wait list reference, all we ++ * need to do is reset the DELWRI_Q flag. 
++ */ ++ xfs_buf_lock(bp); ++ error = bp->b_error; ++ bp->b_flags |= _XBF_DELWRI_Q; ++ xfs_buf_unlock(bp); ++ ++ return error; ++} ++ + int __init + xfs_buf_init(void) + { +diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h +index ad514a8025dd..f961b19b9cc2 100644 +--- a/fs/xfs/xfs_buf.h ++++ b/fs/xfs/xfs_buf.h +@@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct list_head *); + extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); + extern int xfs_buf_delwri_submit(struct list_head *); + extern int xfs_buf_delwri_submit_nowait(struct list_head *); ++extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); + + /* Buffer Daemon Setup Routines */ + extern int xfs_buf_init(void); +diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c +index 0306168af332..e0a0af0946f2 100644 +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -29,6 +29,7 @@ + #include "xfs_error.h" + #include "xfs_trace.h" + #include "xfs_log.h" ++#include "xfs_inode.h" + + + kmem_zone_t *xfs_buf_item_zone; +@@ -322,6 +323,8 @@ xfs_buf_item_format( + ASSERT((bip->bli_flags & XFS_BLI_STALE) || + (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF + && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); ++ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || ++ (bip->bli_flags & XFS_BLI_STALE)); + + + /* +@@ -346,16 +349,6 @@ xfs_buf_item_format( + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + +- if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == +- XFS_BLI_ORDERED) { +- /* +- * The buffer has been logged just to order it. It is not being +- * included in the transaction commit, so don't format it. +- */ +- trace_xfs_buf_item_format_ordered(bip); +- return; +- } +- + for (i = 0; i < bip->bli_format_count; i++) { + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); +@@ -574,26 +567,20 @@ xfs_buf_item_unlock( + { + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; +- bool clean; +- bool aborted; +- int flags; ++ bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); ++ bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); ++ bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); ++#if defined(DEBUG) || defined(XFS_WARN) ++ bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); ++#endif + + /* Clear the buffer's association with this transaction. */ + bp->b_transp = NULL; + + /* +- * If this is a transaction abort, don't return early. Instead, allow +- * the brelse to happen. Normally it would be done for stale +- * (cancelled) buffers at unpin time, but we'll never go through the +- * pin/unpin cycle if we abort inside commit. +- */ +- aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; +- /* +- * Before possibly freeing the buf item, copy the per-transaction state +- * so we can reference it safely later after clearing it from the +- * buffer log item. ++ * The per-transaction state has been copied above so clear it from the ++ * bli. + */ +- flags = bip->bli_flags; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); + + /* +@@ -601,7 +588,7 @@ xfs_buf_item_unlock( + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. + */ +- if (flags & XFS_BLI_STALE) { ++ if (bip->bli_flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { +@@ -619,40 +606,34 @@ xfs_buf_item_unlock( + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. 
+ * +- * Ordered buffers are dirty but may have no recorded changes, so ensure +- * we only release clean items here. ++ * The bli dirty state should match whether the blf has logged segments ++ * except for ordered buffers, where only the bli should be dirty. + */ +- clean = (flags & XFS_BLI_DIRTY) ? false : true; +- if (clean) { +- int i; +- for (i = 0; i < bip->bli_format_count; i++) { +- if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, +- bip->bli_formats[i].blf_map_size)) { +- clean = false; +- break; +- } +- } +- } ++ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || ++ (ordered && dirty && !xfs_buf_item_dirty_format(bip))); + + /* + * Clean buffers, by definition, cannot be in the AIL. However, aborted +- * buffers may be dirty and hence in the AIL. Therefore if we are +- * aborting a buffer and we've just taken the last refernce away, we +- * have to check if it is in the AIL before freeing it. We need to free +- * it in this case, because an aborted transaction has already shut the +- * filesystem down and this is the last chance we will have to do so. ++ * buffers may be in the AIL regardless of dirty state. An aborted ++ * transaction that invalidates a buffer already in the AIL may have ++ * marked it stale and cleared the dirty state, for example. ++ * ++ * Therefore if we are aborting a buffer and we've just taken the last ++ * reference away, we have to check if it is in the AIL before freeing ++ * it. We need to free it in this case, because an aborted transaction ++ * has already shut the filesystem down and this is the last chance we ++ * will have to do so. + */ + if (atomic_dec_and_test(&bip->bli_refcount)) { +- if (clean) +- xfs_buf_item_relse(bp); +- else if (aborted) { ++ if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bp); +- } ++ } else if (!dirty) ++ xfs_buf_item_relse(bp); + } + +- if (!(flags & XFS_BLI_HOLD)) ++ if (!hold) + xfs_buf_relse(bp); + } + +@@ -942,14 +923,22 @@ xfs_buf_item_log( + + + /* +- * Return 1 if the buffer has been logged or ordered in a transaction (at any +- * point, not just the current transaction) and 0 if not. ++ * Return true if the buffer has any ranges logged/dirtied by a transaction, ++ * false otherwise. + */ +-uint +-xfs_buf_item_dirty( +- xfs_buf_log_item_t *bip) ++bool ++xfs_buf_item_dirty_format( ++ struct xfs_buf_log_item *bip) + { +- return (bip->bli_flags & XFS_BLI_DIRTY); ++ int i; ++ ++ for (i = 0; i < bip->bli_format_count; i++) { ++ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, ++ bip->bli_formats[i].blf_map_size)) ++ return true; ++ } ++ ++ return false; + } + + STATIC void +@@ -1051,6 +1040,31 @@ xfs_buf_do_callbacks( + } + } + ++/* ++ * Invoke the error state callback for each log item affected by the failed I/O. ++ * ++ * If a metadata buffer write fails with a non-permanent error, the buffer is ++ * eventually resubmitted and so the completion callbacks are not run. The error ++ * state may need to be propagated to the log items attached to the buffer, ++ * however, so the next AIL push of the item knows hot to handle it correctly. 
++ */ ++STATIC void ++xfs_buf_do_callbacks_fail( ++ struct xfs_buf *bp) ++{ ++ struct xfs_log_item *next; ++ struct xfs_log_item *lip = bp->b_fspriv; ++ struct xfs_ail *ailp = lip->li_ailp; ++ ++ spin_lock(&ailp->xa_lock); ++ for (; lip; lip = next) { ++ next = lip->li_bio_list; ++ if (lip->li_ops->iop_error) ++ lip->li_ops->iop_error(lip, bp); ++ } ++ spin_unlock(&ailp->xa_lock); ++} ++ + static bool + xfs_buf_iodone_callback_error( + struct xfs_buf *bp) +@@ -1120,7 +1134,11 @@ xfs_buf_iodone_callback_error( + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + goto permanent_error; + +- /* still a transient error, higher layers will retry */ ++ /* ++ * Still a transient error, run IO completion failure callbacks and let ++ * the higher layers retry the buffer. ++ */ ++ xfs_buf_do_callbacks_fail(bp); + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return true; +@@ -1201,3 +1219,31 @@ xfs_buf_iodone( + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + xfs_buf_item_free(BUF_ITEM(lip)); + } ++ ++/* ++ * Requeue a failed buffer for writeback ++ * ++ * Return true if the buffer has been re-queued properly, false otherwise ++ */ ++bool ++xfs_buf_resubmit_failed_buffers( ++ struct xfs_buf *bp, ++ struct xfs_log_item *lip, ++ struct list_head *buffer_list) ++{ ++ struct xfs_log_item *next; ++ ++ /* ++ * Clear XFS_LI_FAILED flag from all items before resubmit ++ * ++ * XFS_LI_FAILED set/clear is protected by xa_lock, caller this ++ * function already have it acquired ++ */ ++ for (; lip; lip = next) { ++ next = lip->li_bio_list; ++ xfs_clear_li_failed(lip); ++ } ++ ++ /* Add this buffer back to the delayed write list */ ++ return xfs_buf_delwri_queue(bp, buffer_list); ++} +diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h +index f7eba99d19dd..9690ce62c9a7 100644 +--- a/fs/xfs/xfs_buf_item.h ++++ b/fs/xfs/xfs_buf_item.h +@@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item { + int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); + void xfs_buf_item_relse(struct xfs_buf *); + void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); +-uint xfs_buf_item_dirty(xfs_buf_log_item_t *); ++bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); + void xfs_buf_attach_iodone(struct xfs_buf *, + void(*)(struct xfs_buf *, xfs_log_item_t *), + xfs_log_item_t *); + void xfs_buf_iodone_callbacks(struct xfs_buf *); + void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); ++bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, ++ struct xfs_log_item *, ++ struct list_head *); + + extern kmem_zone_t *xfs_buf_item_zone; + +diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c +index df206cfc21f7..586b398f268d 100644 +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -729,6 +729,7 @@ xfs_file_buffered_aio_write( + xfs_rw_iunlock(ip, iolock); + eofb.eof_flags = XFS_EOF_FLAGS_SYNC; + xfs_icache_free_eofblocks(ip->i_mount, &eofb); ++ xfs_icache_free_cowblocks(ip->i_mount, &eofb); + goto write_retry; + } + +@@ -1139,29 +1140,8 @@ xfs_find_get_desired_pgoff( + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + want); +- /* +- * No page mapped into given range. If we are searching holes +- * and if this is the first time we got into the loop, it means +- * that the given offset is landed in a hole, return it. +- * +- * If we have already stepped through some block buffers to find +- * holes but they all contains data. 
In this case, the last +- * offset is already updated and pointed to the end of the last +- * mapped page, if it does not reach the endpoint to search, +- * that means there should be a hole between them. +- */ +- if (nr_pages == 0) { +- /* Data search found nothing */ +- if (type == DATA_OFF) +- break; +- +- ASSERT(type == HOLE_OFF); +- if (lastoff == startoff || lastoff < endoff) { +- found = true; +- *offset = lastoff; +- } ++ if (nr_pages == 0) + break; +- } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; +@@ -1227,21 +1207,20 @@ xfs_find_get_desired_pgoff( + + /* + * The number of returned pages less than our desired, search +- * done. In this case, nothing was found for searching data, +- * but we found a hole behind the last offset. ++ * done. + */ +- if (nr_pages < want) { +- if (type == HOLE_OFF) { +- *offset = lastoff; +- found = true; +- } ++ if (nr_pages < want) + break; +- } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + ++ /* No page at lastoff and we are not done - we found a hole. */ ++ if (type == HOLE_OFF && lastoff < endoff) { ++ *offset = lastoff; ++ found = true; ++ } + out: + pagevec_release(&pvec); + return found; +diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c +index 74304b6ce84b..86a4911520cc 100644 +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -66,7 +66,6 @@ xfs_inode_alloc( + + XFS_STATS_INC(mp, vn_active); + ASSERT(atomic_read(&ip->i_pincount) == 0); +- ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + +@@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag( + { + struct xfs_mount *mp = pag->pag_mount; + +- ASSERT(spin_is_locked(&pag->pag_ici_lock)); ++ lockdep_assert_held(&pag->pag_ici_lock); + if (pag->pag_ici_reclaimable++) + return; + +@@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag( + { + struct xfs_mount *mp = pag->pag_mount; + +- ASSERT(spin_is_locked(&pag->pag_ici_lock)); ++ lockdep_assert_held(&pag->pag_ici_lock); + if (--pag->pag_ici_reclaimable) + return; + +@@ -1079,11 +1078,11 @@ xfs_reclaim_inode( + * Because we use RCU freeing we need to ensure the inode always appears + * to be reclaimed with an invalid inode number when in the free state. + * We do this as early as possible under the ILOCK so that +- * xfs_iflush_cluster() can be guaranteed to detect races with us here. +- * By doing this, we guarantee that once xfs_iflush_cluster has locked +- * XFS_ILOCK that it will see either a valid, flushable inode that will +- * serialise correctly, or it will see a clean (and invalid) inode that +- * it can skip. ++ * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to ++ * detect races with us here. By doing this, we guarantee that once ++ * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that ++ * it will see either a valid inode that will serialise correctly, or it ++ * will see an invalid inode that it can skip. 
+ */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; +diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c +index 7a0b4eeb99e4..9e795ab08a53 100644 +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -881,7 +881,6 @@ xfs_ialloc( + case S_IFREG: + case S_IFDIR: + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { +- uint64_t di_flags2 = 0; + uint di_flags = 0; + + if (S_ISDIR(mode)) { +@@ -918,20 +917,23 @@ xfs_ialloc( + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; +- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) +- di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags |= di_flags; +- ip->i_d.di_flags2 |= di_flags2; + } + if (pip && + (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && + pip->i_d.di_version == 3 && + ip->i_d.di_version == 3) { ++ uint64_t di_flags2 = 0; ++ + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { +- ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ++ di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } ++ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) ++ di_flags2 |= XFS_DIFLAG2_DAX; ++ ++ ip->i_d.di_flags2 |= di_flags2; + } + /* FALLTHROUGH */ + case S_IFLNK: +@@ -2366,11 +2368,24 @@ xfs_ifree_cluster( + * already marked stale. If we can't lock it, back off + * and retry. + */ +- if (ip != free_ip && +- !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { +- rcu_read_unlock(); +- delay(1); +- goto retry; ++ if (ip != free_ip) { ++ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { ++ rcu_read_unlock(); ++ delay(1); ++ goto retry; ++ } ++ ++ /* ++ * Check the inode number again in case we're ++ * racing with freeing in xfs_reclaim_inode(). ++ * See the comments in that function for more ++ * information as to why the initial check is ++ * not sufficient. ++ */ ++ if (ip->i_ino != inum + i) { ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ continue; ++ } + } + rcu_read_unlock(); + +diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c +index d90e7811ccdd..94915747042c 100644 +--- a/fs/xfs/xfs_inode_item.c ++++ b/fs/xfs/xfs_inode_item.c +@@ -27,6 +27,7 @@ + #include "xfs_error.h" + #include "xfs_trace.h" + #include "xfs_trans_priv.h" ++#include "xfs_buf_item.h" + #include "xfs_log.h" + + +@@ -475,6 +476,23 @@ xfs_inode_item_unpin( + wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + } + ++/* ++ * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer ++ * have been failed during writeback ++ * ++ * This informs the AIL that the inode is already flush locked on the next push, ++ * and acquires a hold on the buffer to ensure that it isn't reclaimed before ++ * dirty data makes it to disk. ++ */ ++STATIC void ++xfs_inode_item_error( ++ struct xfs_log_item *lip, ++ struct xfs_buf *bp) ++{ ++ ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); ++ xfs_set_li_failed(lip, bp); ++} ++ + STATIC uint + xfs_inode_item_push( + struct xfs_log_item *lip, +@@ -484,13 +502,28 @@ xfs_inode_item_push( + { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; +- struct xfs_buf *bp = NULL; ++ struct xfs_buf *bp = lip->li_buf; + uint rval = XFS_ITEM_SUCCESS; + int error; + + if (xfs_ipincount(ip) > 0) + return XFS_ITEM_PINNED; + ++ /* ++ * The buffer containing this item failed to be written back ++ * previously. Resubmit the buffer for IO. 
++ */ ++ if (lip->li_flags & XFS_LI_FAILED) { ++ if (!xfs_buf_trylock(bp)) ++ return XFS_ITEM_LOCKED; ++ ++ if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) ++ rval = XFS_ITEM_FLUSHING; ++ ++ xfs_buf_unlock(bp); ++ return rval; ++ } ++ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + return XFS_ITEM_LOCKED; + +@@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = { + .iop_unlock = xfs_inode_item_unlock, + .iop_committed = xfs_inode_item_committed, + .iop_push = xfs_inode_item_push, +- .iop_committing = xfs_inode_item_committing ++ .iop_committing = xfs_inode_item_committing, ++ .iop_error = xfs_inode_item_error + }; + + +@@ -710,7 +744,8 @@ xfs_iflush_done( + * the AIL lock. + */ + iip = INODE_ITEM(blip); +- if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ++ if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || ++ lip->li_flags & XFS_LI_FAILED) + need_ail++; + + blip = next; +@@ -718,7 +753,8 @@ xfs_iflush_done( + + /* make sure we capture the state of the initial inode. */ + iip = INODE_ITEM(lip); +- if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ++ if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || ++ lip->li_flags & XFS_LI_FAILED) + need_ail++; + + /* +@@ -731,22 +767,30 @@ xfs_iflush_done( + * holding the lock before removing the inode from the AIL. + */ + if (need_ail) { +- struct xfs_log_item *log_items[need_ail]; +- int i = 0; ++ bool mlip_changed = false; ++ ++ /* this is an opencoded batch version of xfs_trans_ail_delete */ + spin_lock(&ailp->xa_lock); + for (blip = lip; blip; blip = blip->li_bio_list) { +- iip = INODE_ITEM(blip); +- if (iip->ili_logged && +- blip->li_lsn == iip->ili_flush_lsn) { +- log_items[i++] = blip; ++ if (INODE_ITEM(blip)->ili_logged && ++ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) ++ mlip_changed |= xfs_ail_delete_one(ailp, blip); ++ else { ++ xfs_clear_li_failed(blip); + } +- ASSERT(i <= need_ail); + } +- /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ +- xfs_trans_ail_delete_bulk(ailp, log_items, i, +- SHUTDOWN_CORRUPT_INCORE); +- } + ++ if (mlip_changed) { ++ if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) ++ xlog_assign_tail_lsn_locked(ailp->xa_mount); ++ if (list_empty(&ailp->xa_ail)) ++ wake_up_all(&ailp->xa_empty); ++ } ++ spin_unlock(&ailp->xa_lock); ++ ++ if (mlip_changed) ++ xfs_log_space_wake(ailp->xa_mount); ++ } + + /* + * clean up and unlock the flush lock now we are done. We can clear the +diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c +index 73cfc7179124..bce2e260f55e 100644 +--- a/fs/xfs/xfs_ioctl.c ++++ b/fs/xfs/xfs_ioctl.c +@@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr( + return 0; + } + +-STATIC void +-xfs_set_diflags( ++STATIC uint16_t ++xfs_flags2diflags( + struct xfs_inode *ip, + unsigned int xflags) + { +- unsigned int di_flags; +- uint64_t di_flags2; +- + /* can't set PREALLOC this way, just preserve it */ +- di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); ++ uint16_t di_flags = ++ (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); ++ + if (xflags & FS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & FS_XFLAG_APPEND) +@@ -967,19 +966,24 @@ xfs_set_diflags( + if (xflags & FS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; + } +- ip->i_d.di_flags = di_flags; + +- /* diflags2 only valid for v3 inodes. 
*/ +- if (ip->i_d.di_version < 3) +- return; ++ return di_flags; ++} ++ ++STATIC uint64_t ++xfs_flags2diflags2( ++ struct xfs_inode *ip, ++ unsigned int xflags) ++{ ++ uint64_t di_flags2 = ++ (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); + +- di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + if (xflags & FS_XFLAG_COWEXTSIZE) + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + +- ip->i_d.di_flags2 = di_flags2; ++ return di_flags2; + } + + STATIC void +@@ -1005,11 +1009,12 @@ xfs_diflags_to_linux( + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; ++#if 0 /* disabled until the flag switching races are sorted out */ + if (xflags & FS_XFLAG_DAX) + inode->i_flags |= S_DAX; + else + inode->i_flags &= ~S_DAX; +- ++#endif + } + + static int +@@ -1019,6 +1024,7 @@ xfs_ioctl_setattr_xflags( + struct fsxattr *fa) + { + struct xfs_mount *mp = ip->i_mount; ++ uint64_t di_flags2; + + /* Can't change realtime flag if any extents are allocated. */ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && +@@ -1049,7 +1055,14 @@ xfs_ioctl_setattr_xflags( + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + +- xfs_set_diflags(ip, fa->fsx_xflags); ++ /* diflags2 only valid for v3 inodes. */ ++ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); ++ if (di_flags2 && ip->i_d.di_version < 3) ++ return -EINVAL; ++ ++ ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); ++ ip->i_d.di_flags2 = di_flags2; ++ + xfs_diflags_to_linux(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c +index a1247c3c1efb..5b81f7f41b80 100644 +--- a/fs/xfs/xfs_iops.c ++++ b/fs/xfs/xfs_iops.c +@@ -802,7 +802,7 @@ xfs_vn_setattr_nonsize( + * Caution: The caller of this function is responsible for calling + * setattr_prepare() or otherwise verifying the change is fine. + */ +-int ++STATIC int + xfs_setattr_size( + struct xfs_inode *ip, + struct iattr *iattr) +diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c +index b57ab34fbf3c..33c9a3aae948 100644 +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -743,15 +743,45 @@ xfs_log_mount_finish( + struct xfs_mount *mp) + { + int error = 0; ++ bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); + + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + return 0; ++ } else if (readonly) { ++ /* Allow unlinked processing to proceed */ ++ mp->m_flags &= ~XFS_MOUNT_RDONLY; + } + ++ /* ++ * During the second phase of log recovery, we need iget and ++ * iput to behave like they do for an active filesystem. ++ * xfs_fs_drop_inode needs to be able to prevent the deletion ++ * of inodes before we're done replaying log items on those ++ * inodes. Turn it off immediately after recovery finishes ++ * so that we don't leak the quota inodes if subsequent mount ++ * activities fail. ++ * ++ * We let all inodes involved in redo item processing end up on ++ * the LRU instead of being evicted immediately so that if we do ++ * something to an unlinked inode, the irele won't cause ++ * premature truncation and freeing of the inode, which results ++ * in log recovery failure. We have to evict the unreferenced ++ * lru inodes after clearing MS_ACTIVE because we don't ++ * otherwise clean up the lru if there's a subsequent failure in ++ * xfs_mountfs, which leads to us leaking the inodes if nothing ++ * else (e.g. quotacheck) references the inodes before the ++ * mount failure occurs. 
++ */ ++ mp->m_super->s_flags |= MS_ACTIVE; + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); ++ mp->m_super->s_flags &= ~MS_ACTIVE; ++ evict_inodes(mp->m_super); ++ ++ if (readonly) ++ mp->m_flags |= XFS_MOUNT_RDONLY; + + return error; + } +@@ -801,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) + int error; + + /* +- * Don't write out unmount record on read-only mounts. ++ * Don't write out unmount record on norecovery mounts or ro devices. + * Or, if we are doing a forced umount (typically because of IO errors). + */ +- if (mp->m_flags & XFS_MOUNT_RDONLY) ++ if (mp->m_flags & XFS_MOUNT_NORECOVERY || ++ xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { ++ ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + return 0; ++ } + + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); +@@ -3304,8 +3337,6 @@ _xfs_log_force( + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; +- if (log_flushed) +- *log_flushed = 1; + } else { + + no_sleep: +@@ -3409,8 +3440,6 @@ _xfs_log_force_lsn( + + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); +- if (log_flushed) +- *log_flushed = 1; + already_slept = 1; + goto try_again; + } +@@ -3444,9 +3473,6 @@ _xfs_log_force_lsn( + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; +- +- if (log_flushed) +- *log_flushed = 1; + } else { /* just return */ + spin_unlock(&log->l_icloglock); + } +diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c +index 9b3d7c76915d..05909269f973 100644 +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -1029,61 +1029,106 @@ xlog_seek_logrec_hdr( + } + + /* +- * Check the log tail for torn writes. This is required when torn writes are +- * detected at the head and the head had to be walked back to a previous record. +- * The tail of the previous record must now be verified to ensure the torn +- * writes didn't corrupt the previous tail. ++ * Calculate distance from head to tail (i.e., unused space in the log). ++ */ ++static inline int ++xlog_tail_distance( ++ struct xlog *log, ++ xfs_daddr_t head_blk, ++ xfs_daddr_t tail_blk) ++{ ++ if (head_blk < tail_blk) ++ return tail_blk - head_blk; ++ ++ return tail_blk + (log->l_logBBsize - head_blk); ++} ++ ++/* ++ * Verify the log tail. This is particularly important when torn or incomplete ++ * writes have been detected near the front of the log and the head has been ++ * walked back accordingly. + * +- * Return an error if CRC verification fails as recovery cannot proceed. ++ * We also have to handle the case where the tail was pinned and the head ++ * blocked behind the tail right before a crash. If the tail had been pushed ++ * immediately prior to the crash and the subsequent checkpoint was only ++ * partially written, it's possible it overwrote the last referenced tail in the ++ * log with garbage. This is not a coherency problem because the tail must have ++ * been pushed before it can be overwritten, but appears as log corruption to ++ * recovery because we have no way to know the tail was updated if the ++ * subsequent checkpoint didn't write successfully. ++ * ++ * Therefore, CRC check the log from tail to head. If a failure occurs and the ++ * offending record is within max iclog bufs from the head, walk the tail ++ * forward and retry until a valid tail is found or corruption is detected out ++ * of the range of a possible overwrite. 
+ */ + STATIC int + xlog_verify_tail( + struct xlog *log, + xfs_daddr_t head_blk, +- xfs_daddr_t tail_blk) ++ xfs_daddr_t *tail_blk, ++ int hsize) + { + struct xlog_rec_header *thead; + struct xfs_buf *bp; + xfs_daddr_t first_bad; +- int count; + int error = 0; + bool wrapped; +- xfs_daddr_t tmp_head; ++ xfs_daddr_t tmp_tail; ++ xfs_daddr_t orig_tail = *tail_blk; + + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + + /* +- * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get +- * a temporary head block that points after the last possible +- * concurrently written record of the tail. ++ * Make sure the tail points to a record (returns positive count on ++ * success). + */ +- count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, +- XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, +- &wrapped); +- if (count < 0) { +- error = count; ++ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, ++ &tmp_tail, &thead, &wrapped); ++ if (error < 0) + goto out; +- } +- +- /* +- * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran +- * into the actual log head. tmp_head points to the start of the record +- * so update it to the actual head block. +- */ +- if (count < XLOG_MAX_ICLOGS + 1) +- tmp_head = head_blk; ++ if (*tail_blk != tmp_tail) ++ *tail_blk = tmp_tail; + + /* +- * We now have a tail and temporary head block that covers at least +- * XLOG_MAX_ICLOGS records from the tail. We need to verify that these +- * records were completely written. Run a CRC verification pass from +- * tail to head and return the result. ++ * Run a CRC check from the tail to the head. We can't just check ++ * MAX_ICLOGS records past the tail because the tail may point to stale ++ * blocks cleared during the search for the head/tail. These blocks are ++ * overwritten with zero-length records and thus record count is not a ++ * reliable indicator of the iclog state before a crash. + */ +- error = xlog_do_recovery_pass(log, tmp_head, tail_blk, ++ first_bad = 0; ++ error = xlog_do_recovery_pass(log, head_blk, *tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); ++ while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { ++ int tail_distance; ++ ++ /* ++ * Is corruption within range of the head? If so, retry from ++ * the next record. Otherwise return an error. ++ */ ++ tail_distance = xlog_tail_distance(log, head_blk, first_bad); ++ if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) ++ break; ++ ++ /* skip to the next record; returns positive count on success */ ++ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, ++ &tmp_tail, &thead, &wrapped); ++ if (error < 0) ++ goto out; ++ ++ *tail_blk = tmp_tail; ++ first_bad = 0; ++ error = xlog_do_recovery_pass(log, head_blk, *tail_blk, ++ XLOG_RECOVER_CRCPASS, &first_bad); ++ } + ++ if (!error && *tail_blk != orig_tail) ++ xfs_warn(log->l_mp, ++ "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", ++ orig_tail, *tail_blk); + out: + xlog_put_bp(bp); + return error; +@@ -1143,7 +1188,7 @@ xlog_verify_head( + */ + error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, + XLOG_RECOVER_CRCPASS, &first_bad); +- if (error == -EFSBADCRC) { ++ if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { + /* + * We've hit a potential torn write. Reset the error and warn + * about it. +@@ -1183,31 +1228,12 @@ xlog_verify_head( + ASSERT(0); + return 0; + } +- +- /* +- * Now verify the tail based on the updated head. 
This is +- * required because the torn writes trimmed from the head could +- * have been written over the tail of a previous record. Return +- * any errors since recovery cannot proceed if the tail is +- * corrupt. +- * +- * XXX: This leaves a gap in truly robust protection from torn +- * writes in the log. If the head is behind the tail, the tail +- * pushes forward to create some space and then a crash occurs +- * causing the writes into the previous record's tail region to +- * tear, log recovery isn't able to recover. +- * +- * How likely is this to occur? If possible, can we do something +- * more intelligent here? Is it safe to push the tail forward if +- * we can determine that the tail is within the range of the +- * torn write (e.g., the kernel can only overwrite the tail if +- * it has actually been pushed forward)? Alternatively, could we +- * somehow prevent this condition at runtime? +- */ +- error = xlog_verify_tail(log, *head_blk, *tail_blk); + } ++ if (error) ++ return error; + +- return error; ++ return xlog_verify_tail(log, *head_blk, tail_blk, ++ be32_to_cpu((*rhead)->h_size)); + } + + /* +@@ -4152,7 +4178,7 @@ xlog_recover_commit_trans( + + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 + +- hlist_del(&trans->r_list); ++ hlist_del_init(&trans->r_list); + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) +@@ -4354,6 +4380,8 @@ xlog_recover_free_trans( + xlog_recover_item_t *item, *n; + int i; + ++ hlist_del_init(&trans->r_list); ++ + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); +@@ -4799,12 +4827,16 @@ xlog_recover_process_intents( + int error = 0; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp; ++#if defined(DEBUG) || defined(XFS_WARN) + xfs_lsn_t last_lsn; ++#endif + + ailp = log->l_ailp; + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); ++#if defined(DEBUG) || defined(XFS_WARN) + last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); ++#endif + while (lip != NULL) { + /* + * We're done when we see something other than an intent. +@@ -5214,7 +5246,7 @@ xlog_do_recovery_pass( + xfs_daddr_t *first_bad) /* out: first bad log rec */ + { + xlog_rec_header_t *rhead; +- xfs_daddr_t blk_no; ++ xfs_daddr_t blk_no, rblk_no; + xfs_daddr_t rhead_blk; + char *offset; + xfs_buf_t *hbp, *dbp; +@@ -5222,11 +5254,15 @@ xlog_do_recovery_pass( + int error2 = 0; + int bblks, split_bblks; + int hblks, split_hblks, wrapped_hblks; ++ int i; + struct hlist_head rhash[XLOG_RHASH_SIZE]; + LIST_HEAD (buffer_list); + + ASSERT(head_blk != tail_blk); +- rhead_blk = 0; ++ blk_no = rhead_blk = tail_blk; ++ ++ for (i = 0; i < XLOG_RHASH_SIZE; i++) ++ INIT_HLIST_HEAD(&rhash[i]); + + /* + * Read the header of the tail block and get the iclog buffer size from +@@ -5301,7 +5337,6 @@ xlog_do_recovery_pass( + } + + memset(rhash, 0, sizeof(rhash)); +- blk_no = rhead_blk = tail_blk; + if (tail_blk > head_blk) { + /* + * Perform recovery around the end of the physical log. +@@ -5363,9 +5398,19 @@ xlog_do_recovery_pass( + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + blk_no += hblks; + +- /* Read in data for log record */ +- if (blk_no + bblks <= log->l_logBBsize) { +- error = xlog_bread(log, blk_no, bblks, dbp, ++ /* ++ * Read the log record data in multiple reads if it ++ * wraps around the end of the log. Note that if the ++ * header already wrapped, blk_no could point past the ++ * end of the log. The record data is contiguous in ++ * that case. 
++ */ ++ if (blk_no + bblks <= log->l_logBBsize || ++ blk_no >= log->l_logBBsize) { ++ /* mod blk_no in case the header wrapped and ++ * pushed it beyond the end of the log */ ++ rblk_no = do_mod(blk_no, log->l_logBBsize); ++ error = xlog_bread(log, rblk_no, bblks, dbp, + &offset); + if (error) + goto bread_err2; +@@ -5464,6 +5509,19 @@ xlog_do_recovery_pass( + if (error && first_bad) + *first_bad = rhead_blk; + ++ /* ++ * Transactions are freed at commit time but transactions without commit ++ * records on disk are never committed. Free any that may be left in the ++ * hash table. ++ */ ++ for (i = 0; i < XLOG_RHASH_SIZE; i++) { ++ struct hlist_node *tmp; ++ struct xlog_recover *trans; ++ ++ hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list) ++ xlog_recover_free_trans(trans); ++ } ++ + return error ? error : error2; + } + +@@ -5542,6 +5600,8 @@ xlog_do_recover( + xfs_buf_t *bp; + xfs_sb_t *sbp; + ++ trace_xfs_log_recover(log, head_blk, tail_blk); ++ + /* + * First replay the images in the log. + */ +diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c +index 13796f212f98..d4ce8d277992 100644 +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -924,15 +924,6 @@ xfs_mountfs( + } + } + +- /* +- * During the second phase of log recovery, we need iget and +- * iput to behave like they do for an active filesystem. +- * xfs_fs_drop_inode needs to be able to prevent the deletion +- * of inodes before we're done replaying log items on those +- * inodes. +- */ +- mp->m_super->s_flags |= MS_ACTIVE; +- + /* + * Finish recovering the file system. This part needed to be delayed + * until after the root and real-time bitmap inodes were consistently +@@ -1008,12 +999,13 @@ xfs_mountfs( + out_quota: + xfs_qm_unmount_quotas(mp); + out_rtunmount: +- mp->m_super->s_flags &= ~MS_ACTIVE; + xfs_rtunmount_inodes(mp); + out_rele_rip: + IRELE(rip); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); ++ /* Clean out dquots that might be in memory after quotacheck. */ ++ xfs_qm_unmount(mp); + out_log_dealloc: + mp->m_flags |= XFS_MOUNT_UNMOUNTING; + xfs_log_mount_cancel(mp); +diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c +index 8b9a9f15f022..1fdd3face2d9 100644 +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -111,6 +111,9 @@ xfs_qm_dquot_walk( + skipped = 0; + break; + } ++ /* we're done if id overflows back to zero */ ++ if (!next_index) ++ break; + } + + if (skipped) { +@@ -1247,6 +1250,7 @@ xfs_qm_flush_one( + struct xfs_dquot *dqp, + void *data) + { ++ struct xfs_mount *mp = dqp->q_mount; + struct list_head *buffer_list = data; + struct xfs_buf *bp = NULL; + int error = 0; +@@ -1257,7 +1261,32 @@ xfs_qm_flush_one( + if (!XFS_DQ_IS_DIRTY(dqp)) + goto out_unlock; + +- xfs_dqflock(dqp); ++ /* ++ * The only way the dquot is already flush locked by the time quotacheck ++ * gets here is if reclaim flushed it before the dqadjust walk dirtied ++ * it for the final time. Quotacheck collects all dquot bufs in the ++ * local delwri queue before dquots are dirtied, so reclaim can't have ++ * possibly queued it for I/O. The only way out is to push the buffer to ++ * cycle the flush lock. 
++ */ ++ if (!xfs_dqflock_nowait(dqp)) { ++ /* buf is pinned in-core by delwri list */ ++ DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, ++ mp->m_quotainfo->qi_dqchunklen); ++ bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); ++ if (!bp) { ++ error = -EINVAL; ++ goto out_unlock; ++ } ++ xfs_buf_unlock(bp); ++ ++ xfs_buf_delwri_pushbuf(bp, buffer_list); ++ xfs_buf_rele(bp); ++ ++ error = -EAGAIN; ++ goto out_unlock; ++ } ++ + error = xfs_qm_dqflush(dqp, &bp); + if (error) + goto out_unlock; +diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c +index 29a75ecb2425..0015c19c7455 100644 +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -169,6 +169,8 @@ xfs_reflink_find_shared( + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; ++ if (!agbp) ++ return -ENOMEM; + + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + +@@ -333,7 +335,7 @@ xfs_reflink_convert_cow_extent( + struct xfs_defer_ops *dfops) + { + struct xfs_bmbt_irec irec = *imap; +- xfs_fsblock_t first_block; ++ xfs_fsblock_t first_block = NULLFSBLOCK; + int nimaps = 1; + + if (imap->br_state == XFS_EXT_NORM) +diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c +index 882fb8524fcb..67d589e0a49f 100644 +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1214,7 +1214,7 @@ xfs_test_remount_options( + tmp_mp->m_super = sb; + error = xfs_parseargs(tmp_mp, options); + xfs_free_fsname(tmp_mp); +- kfree(tmp_mp); ++ kmem_free(tmp_mp); + + return error; + } +diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h +index 828f383df121..bdf69e1c7410 100644 +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); + DEFINE_BUF_EVENT(xfs_buf_delwri_queue); + DEFINE_BUF_EVENT(xfs_buf_delwri_queued); + DEFINE_BUF_EVENT(xfs_buf_delwri_split); ++DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); + DEFINE_BUF_EVENT(xfs_buf_get_uncached); + DEFINE_BUF_EVENT(xfs_bdstrat_shut); + DEFINE_BUF_EVENT(xfs_buf_item_relse); +@@ -519,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); + DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); +@@ -1990,6 +1990,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ + DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); + DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); + ++TRACE_EVENT(xfs_log_recover, ++ TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), ++ TP_ARGS(log, headblk, tailblk), ++ TP_STRUCT__entry( ++ __field(dev_t, dev) ++ __field(xfs_daddr_t, headblk) ++ __field(xfs_daddr_t, tailblk) ++ ), ++ TP_fast_assign( ++ __entry->dev = log->l_mp->m_super->s_dev; ++ __entry->headblk = headblk; ++ __entry->tailblk = tailblk; ++ ), ++ TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", ++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, ++ __entry->tailblk) ++) ++ + TRACE_EVENT(xfs_log_recover_record, + TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), + TP_ARGS(log, rhead, pass), +diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h +index 98024cb933ef..5669cf00bae0 100644 +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -50,6 +50,7 @@ typedef struct xfs_log_item { + struct xfs_ail *li_ailp; /* ptr to AIL */ + uint li_type; /* item type */ + uint li_flags; /* misc flags */ ++ struct 
xfs_buf *li_buf; /* real buffer pointer */ + struct xfs_log_item *li_bio_list; /* buffer item list */ + void (*li_cb)(struct xfs_buf *, + struct xfs_log_item *); +@@ -65,11 +66,13 @@ typedef struct xfs_log_item { + } xfs_log_item_t; + + #define XFS_LI_IN_AIL 0x1 +-#define XFS_LI_ABORTED 0x2 ++#define XFS_LI_ABORTED 0x2 ++#define XFS_LI_FAILED 0x4 + + #define XFS_LI_FLAGS \ + { XFS_LI_IN_AIL, "IN_AIL" }, \ +- { XFS_LI_ABORTED, "ABORTED" } ++ { XFS_LI_ABORTED, "ABORTED" }, \ ++ { XFS_LI_FAILED, "FAILED" } + + struct xfs_item_ops { + void (*iop_size)(xfs_log_item_t *, int *, int *); +@@ -80,6 +83,7 @@ struct xfs_item_ops { + void (*iop_unlock)(xfs_log_item_t *); + xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); ++ void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); + }; + + void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, +@@ -213,12 +217,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); + void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); + void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); + void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); +-void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); ++bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); + void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); + void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); + void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); + void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); +-void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); ++void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, ++ uint); ++void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); + void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); + + void xfs_extent_free_init_defer_op(void); +@@ -277,6 +283,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, + struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, struct xfs_inode *ip, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, xfs_exntst_t state); ++ xfs_filblks_t *blockcount, xfs_exntst_t state); + + #endif /* __XFS_TRANS_H__ */ +diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c +index d6c9c3e9e02b..70f5ab017323 100644 +--- a/fs/xfs/xfs_trans_ail.c ++++ b/fs/xfs/xfs_trans_ail.c +@@ -684,8 +684,24 @@ xfs_trans_ail_update_bulk( + } + } + +-/* +- * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL ++bool ++xfs_ail_delete_one( ++ struct xfs_ail *ailp, ++ struct xfs_log_item *lip) ++{ ++ struct xfs_log_item *mlip = xfs_ail_min(ailp); ++ ++ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); ++ xfs_ail_delete(ailp, lip); ++ xfs_clear_li_failed(lip); ++ lip->li_flags &= ~XFS_LI_IN_AIL; ++ lip->li_lsn = 0; ++ ++ return mlip == lip; ++} ++ ++/** ++ * Remove a log items from the AIL + * + * @xfs_trans_ail_delete_bulk takes an array of log items that all need to + * removed from the AIL. The caller is already holding the AIL lock, and done +@@ -706,52 +722,36 @@ xfs_trans_ail_update_bulk( + * before returning. 
+ */ + void +-xfs_trans_ail_delete_bulk( ++xfs_trans_ail_delete( + struct xfs_ail *ailp, +- struct xfs_log_item **log_items, +- int nr_items, ++ struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->xa_lock) + { +- xfs_log_item_t *mlip; +- int mlip_changed = 0; +- int i; +- +- mlip = xfs_ail_min(ailp); ++ struct xfs_mount *mp = ailp->xa_mount; ++ bool mlip_changed; + +- for (i = 0; i < nr_items; i++) { +- struct xfs_log_item *lip = log_items[i]; +- if (!(lip->li_flags & XFS_LI_IN_AIL)) { +- struct xfs_mount *mp = ailp->xa_mount; +- +- spin_unlock(&ailp->xa_lock); +- if (!XFS_FORCED_SHUTDOWN(mp)) { +- xfs_alert_tag(mp, XFS_PTAG_AILDELETE, +- "%s: attempting to delete a log item that is not in the AIL", +- __func__); +- xfs_force_shutdown(mp, shutdown_type); +- } +- return; ++ if (!(lip->li_flags & XFS_LI_IN_AIL)) { ++ spin_unlock(&ailp->xa_lock); ++ if (!XFS_FORCED_SHUTDOWN(mp)) { ++ xfs_alert_tag(mp, XFS_PTAG_AILDELETE, ++ "%s: attempting to delete a log item that is not in the AIL", ++ __func__); ++ xfs_force_shutdown(mp, shutdown_type); + } +- +- trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); +- xfs_ail_delete(ailp, lip); +- lip->li_flags &= ~XFS_LI_IN_AIL; +- lip->li_lsn = 0; +- if (mlip == lip) +- mlip_changed = 1; ++ return; + } + ++ mlip_changed = xfs_ail_delete_one(ailp, lip); + if (mlip_changed) { +- if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) +- xlog_assign_tail_lsn_locked(ailp->xa_mount); ++ if (!XFS_FORCED_SHUTDOWN(mp)) ++ xlog_assign_tail_lsn_locked(mp); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); +- spin_unlock(&ailp->xa_lock); ++ } + ++ spin_unlock(&ailp->xa_lock); ++ if (mlip_changed) + xfs_log_space_wake(ailp->xa_mount); +- } else { +- spin_unlock(&ailp->xa_lock); +- } + } + + int +diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c +index 6408e7d7c08c..14543d93cd4b 100644 +--- a/fs/xfs/xfs_trans_bmap.c ++++ b/fs/xfs/xfs_trans_bmap.c +@@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update( + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, +- xfs_filblks_t blockcount, ++ xfs_filblks_t *blockcount, + xfs_exntst_t state) + { + int error; +@@ -196,16 +196,23 @@ xfs_bmap_update_finish_item( + void **state) + { + struct xfs_bmap_intent *bmap; ++ xfs_filblks_t count; + int error; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); ++ count = bmap->bi_bmap.br_blockcount; + error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, + bmap->bi_type, + bmap->bi_owner, bmap->bi_whichfork, + bmap->bi_bmap.br_startoff, + bmap->bi_bmap.br_startblock, +- bmap->bi_bmap.br_blockcount, ++ &count, + bmap->bi_bmap.br_state); ++ if (!error && count > 0) { ++ ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); ++ bmap->bi_bmap.br_blockcount = count; ++ return -EAGAIN; ++ } + kmem_free(bmap); + return error; + } +diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c +index 8ee29ca132dc..3ba7a96a8abd 100644 +--- a/fs/xfs/xfs_trans_buf.c ++++ b/fs/xfs/xfs_trans_buf.c +@@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp, + xfs_buf_t *bp) + { + xfs_buf_log_item_t *bip; ++ int freed; + + /* + * Default to a normal brelse() call if the tp is NULL. +@@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp, + /* + * Drop our reference to the buf log item. + */ +- atomic_dec(&bip->bli_refcount); ++ freed = atomic_dec_and_test(&bip->bli_refcount); + + /* +- * If the buf item is not tracking data in the log, then +- * we must free it before releasing the buffer back to the +- * free pool. 
Before releasing the buffer to the free pool, +- * clear the transaction pointer in b_fsprivate2 to dissolve +- * its relation to this transaction. ++ * If the buf item is not tracking data in the log, then we must free it ++ * before releasing the buffer back to the free pool. ++ * ++ * If the fs has shutdown and we dropped the last reference, it may fall ++ * on us to release a (possibly dirty) bli if it never made it to the ++ * AIL (e.g., the aborted unpin already happened and didn't release it ++ * due to our reference). Since we're already shutdown and need xa_lock, ++ * just force remove from the AIL and release the bli here. + */ +- if (!xfs_buf_item_dirty(bip)) { ++ if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { ++ xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); ++ xfs_buf_item_relse(bp); ++ } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { + /*** + ASSERT(bp->b_pincount == 0); + ***/ +@@ -486,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp, + } + + /* +- * This is called to mark bytes first through last inclusive of the given +- * buffer as needing to be logged when the transaction is committed. +- * The buffer must already be associated with the given transaction. +- * +- * First and last are numbers relative to the beginning of this buffer, +- * so the first byte in the buffer is numbered 0 regardless of the +- * value of b_blkno. ++ * Mark a buffer dirty in the transaction. + */ + void +-xfs_trans_log_buf(xfs_trans_t *tp, +- xfs_buf_t *bp, +- uint first, +- uint last) ++xfs_trans_dirty_buf( ++ struct xfs_trans *tp, ++ struct xfs_buf *bp) + { +- xfs_buf_log_item_t *bip = bp->b_fspriv; ++ struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); +- ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); + +@@ -524,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, + bp->b_iodone = xfs_buf_iodone_callbacks; + bip->bli_item.li_cb = xfs_buf_iodone; + +- trace_xfs_trans_log_buf(bip); +- + /* + * If we invalidated the buffer within this transaction, then + * cancel the invalidation now that we're dirtying the buffer +@@ -538,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp, + bp->b_flags &= ~XBF_STALE; + bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; + } ++ bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; + + tp->t_flags |= XFS_TRANS_DIRTY; + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; ++} + +- /* +- * If we have an ordered buffer we are not logging any dirty range but +- * it still needs to be marked dirty and that it has been logged. +- */ +- bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; +- if (!(bip->bli_flags & XFS_BLI_ORDERED)) +- xfs_buf_item_log(bip, first, last); ++/* ++ * This is called to mark bytes first through last inclusive of the given ++ * buffer as needing to be logged when the transaction is committed. ++ * The buffer must already be associated with the given transaction. ++ * ++ * First and last are numbers relative to the beginning of this buffer, ++ * so the first byte in the buffer is numbered 0 regardless of the ++ * value of b_blkno. 
++ */ ++void ++xfs_trans_log_buf( ++ struct xfs_trans *tp, ++ struct xfs_buf *bp, ++ uint first, ++ uint last) ++{ ++ struct xfs_buf_log_item *bip = bp->b_fspriv; ++ ++ ASSERT(first <= last && last < BBTOB(bp->b_length)); ++ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); ++ ++ xfs_trans_dirty_buf(tp, bp); ++ ++ trace_xfs_trans_log_buf(bip); ++ xfs_buf_item_log(bip, first, last); + } + + +@@ -701,14 +718,13 @@ xfs_trans_inode_alloc_buf( + } + + /* +- * Mark the buffer as ordered for this transaction. This means +- * that the contents of the buffer are not recorded in the transaction +- * but it is tracked in the AIL as though it was. This allows us +- * to record logical changes in transactions rather than the physical +- * changes we make to the buffer without changing writeback ordering +- * constraints of metadata buffers. ++ * Mark the buffer as ordered for this transaction. This means that the contents ++ * of the buffer are not recorded in the transaction but it is tracked in the ++ * AIL as though it was. This allows us to record logical changes in ++ * transactions rather than the physical changes we make to the buffer without ++ * changing writeback ordering constraints of metadata buffers. + */ +-void ++bool + xfs_trans_ordered_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) +@@ -719,8 +735,18 @@ xfs_trans_ordered_buf( + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ++ if (xfs_buf_item_dirty_format(bip)) ++ return false; ++ + bip->bli_flags |= XFS_BLI_ORDERED; + trace_xfs_buf_item_ordered(bip); ++ ++ /* ++ * We don't log a dirty range of an ordered buffer but it still needs ++ * to be marked dirty and that it has been logged. ++ */ ++ xfs_trans_dirty_buf(tp, bp); ++ return true; + } + + /* +diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h +index 49931b72da8a..b317a3644c00 100644 +--- a/fs/xfs/xfs_trans_priv.h ++++ b/fs/xfs/xfs_trans_priv.h +@@ -106,18 +106,9 @@ xfs_trans_ail_update( + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); + } + +-void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, +- struct xfs_log_item **log_items, int nr_items, +- int shutdown_type) +- __releases(ailp->xa_lock); +-static inline void +-xfs_trans_ail_delete( +- struct xfs_ail *ailp, +- xfs_log_item_t *lip, +- int shutdown_type) __releases(ailp->xa_lock) +-{ +- xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); +-} ++bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); ++void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, ++ int shutdown_type) __releases(ailp->xa_lock); + + static inline void + xfs_trans_ail_remove( +@@ -173,4 +164,35 @@ xfs_trans_ail_copy_lsn( + *dst = *src; + } + #endif ++ ++static inline void ++xfs_clear_li_failed( ++ struct xfs_log_item *lip) ++{ ++ struct xfs_buf *bp = lip->li_buf; ++ ++ ASSERT(lip->li_flags & XFS_LI_IN_AIL); ++ lockdep_assert_held(&lip->li_ailp->xa_lock); ++ ++ if (lip->li_flags & XFS_LI_FAILED) { ++ lip->li_flags &= ~XFS_LI_FAILED; ++ lip->li_buf = NULL; ++ xfs_buf_rele(bp); ++ } ++} ++ ++static inline void ++xfs_set_li_failed( ++ struct xfs_log_item *lip, ++ struct xfs_buf *bp) ++{ ++ lockdep_assert_held(&lip->li_ailp->xa_lock); ++ ++ if (!(lip->li_flags & XFS_LI_FAILED)) { ++ xfs_buf_hold(bp); ++ lip->li_flags |= XFS_LI_FAILED; ++ lip->li_buf = bp; ++ } ++} ++ + #endif /* __XFS_TRANS_PRIV_H__ */ +diff --git a/include/linux/fs.h b/include/linux/fs.h +index dd88ded27fc8..d705ae084edd 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -2760,6 +2760,7 @@ static 
inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; + #endif + extern void unlock_new_inode(struct inode *); + extern unsigned int get_next_ino(void); ++extern void evict_inodes(struct super_block *sb); + + extern void __iget(struct inode * inode); + extern void iget_failed(struct inode *); +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index 780e7171f548..23db1ae37464 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -3901,6 +3901,8 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, + updev; \ + updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter))) + ++bool netdev_has_any_upper_dev(struct net_device *dev); ++ + void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter); + void *netdev_lower_get_next_private_rcu(struct net_device *dev, +diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h +index 909972aa3acd..634d19203e7d 100644 +--- a/include/net/inet_frag.h ++++ b/include/net/inet_frag.h +@@ -1,14 +1,9 @@ + #ifndef __NET_FRAG_H__ + #define __NET_FRAG_H__ + +-#include <linux/percpu_counter.h> +- + struct netns_frags { +- /* The percpu_counter "mem" need to be cacheline aligned. +- * mem.count must not share cacheline with other writers +- */ +- struct percpu_counter mem ____cacheline_aligned_in_smp; +- ++ /* Keep atomic mem on separate cachelines in structs that include it */ ++ atomic_t mem ____cacheline_aligned_in_smp; + /* sysctls */ + int timeout; + int high_thresh; +@@ -108,15 +103,10 @@ struct inet_frags { + int inet_frags_init(struct inet_frags *); + void inet_frags_fini(struct inet_frags *); + +-static inline int inet_frags_init_net(struct netns_frags *nf) +-{ +- return percpu_counter_init(&nf->mem, 0, GFP_KERNEL); +-} +-static inline void inet_frags_uninit_net(struct netns_frags *nf) ++static inline void inet_frags_init_net(struct netns_frags *nf) + { +- percpu_counter_destroy(&nf->mem); ++ atomic_set(&nf->mem, 0); + } +- + void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); + + void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); +@@ -140,37 +130,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q) + + /* Memory Tracking Functions. */ + +-/* The default percpu_counter batch size is not big enough to scale to +- * fragmentation mem acct sizes. 
+- * The mem size of a 64K fragment is approx: +- * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes +- */ +-static unsigned int frag_percpu_counter_batch = 130000; +- + static inline int frag_mem_limit(struct netns_frags *nf) + { +- return percpu_counter_read(&nf->mem); ++ return atomic_read(&nf->mem); + } + + static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) + { +- __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch); ++ atomic_sub(i, &nf->mem); + } + + static inline void add_frag_mem_limit(struct netns_frags *nf, int i) + { +- __percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch); ++ atomic_add(i, &nf->mem); + } + +-static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) ++static inline int sum_frag_mem_limit(struct netns_frags *nf) + { +- unsigned int res; +- +- local_bh_disable(); +- res = percpu_counter_sum_positive(&nf->mem); +- local_bh_enable(); +- +- return res; ++ return atomic_read(&nf->mem); + } + + /* RFC 3168 support : +diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h +index a74e2aa40ef4..a6bcb18ac4c3 100644 +--- a/include/net/ip6_fib.h ++++ b/include/net/ip6_fib.h +@@ -68,6 +68,7 @@ struct fib6_node { + __u16 fn_flags; + int fn_sernum; + struct rt6_info *rr_ptr; ++ struct rcu_head rcu; + }; + + #ifndef CONFIG_IPV6_SUBTREES +@@ -102,7 +103,7 @@ struct rt6_info { + * the same cache line. + */ + struct fib6_table *rt6i_table; +- struct fib6_node *rt6i_node; ++ struct fib6_node __rcu *rt6i_node; + + struct in6_addr rt6i_gateway; + +@@ -165,13 +166,40 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) + rt0->rt6i_flags |= RTF_EXPIRES; + } + ++/* Function to safely get fn->sernum for passed in rt ++ * and store result in passed in cookie. ++ * Return true if we can get cookie safely ++ * Return false if not ++ */ ++static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, ++ u32 *cookie) ++{ ++ struct fib6_node *fn; ++ bool status = false; ++ ++ rcu_read_lock(); ++ fn = rcu_dereference(rt->rt6i_node); ++ ++ if (fn) { ++ *cookie = fn->fn_sernum; ++ status = true; ++ } ++ ++ rcu_read_unlock(); ++ return status; ++} ++ + static inline u32 rt6_get_cookie(const struct rt6_info *rt) + { ++ u32 cookie = 0; ++ + if (rt->rt6i_flags & RTF_PCPU || + (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from)) + rt = (struct rt6_info *)(rt->dst.from); + +- return rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; ++ rt6_get_cookie_safe(rt, &cookie); ++ ++ return cookie; + } + + static inline void ip6_rt_put(struct rt6_info *rt) +diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c +index 89a687f3c0a3..5f5e28f210e0 100644 +--- a/net/bridge/br_device.c ++++ b/net/bridge/br_device.c +@@ -53,6 +53,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) + brstats->tx_bytes += skb->len; + u64_stats_update_end(&brstats->syncp); + ++#ifdef CONFIG_NET_SWITCHDEV ++ skb->offload_fwd_mark = 0; ++#endif + BR_INPUT_SKB_CB(skb)->brdev = dev; + + skb_reset_mac_header(skb); +diff --git a/net/core/datagram.c b/net/core/datagram.c +index 58dfa23d12ca..4fa4011feec1 100644 +--- a/net/core/datagram.c ++++ b/net/core/datagram.c +@@ -351,7 +351,7 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) + if (flags & MSG_PEEK) { + err = -ENOENT; + spin_lock_bh(&sk->sk_receive_queue.lock); +- if (skb == skb_peek(&sk->sk_receive_queue)) { ++ if (skb->next) { + __skb_unlink(skb, &sk->sk_receive_queue); + atomic_dec(&skb->users); + err = 0; +diff --git a/net/core/dev.c b/net/core/dev.c +index 1d0a7369d5a2..ba7b8121a414 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -5337,12 +5337,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev); + * Find out if a device is linked to an upper device and return true in case + * it is. The caller must hold the RTNL lock. + */ +-static bool netdev_has_any_upper_dev(struct net_device *dev) ++bool netdev_has_any_upper_dev(struct net_device *dev) + { + ASSERT_RTNL(); + + return !list_empty(&dev->all_adj_list.upper); + } ++EXPORT_SYMBOL(netdev_has_any_upper_dev); + + /** + * netdev_master_upper_dev_get - Get master upper device +diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c +index 30d875dff6b5..f85b08baff16 100644 +--- a/net/ieee802154/6lowpan/reassembly.c ++++ b/net/ieee802154/6lowpan/reassembly.c +@@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net) + { + struct netns_ieee802154_lowpan *ieee802154_lowpan = + net_ieee802154_lowpan(net); +- int res; + + ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; + ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; + +- res = inet_frags_init_net(&ieee802154_lowpan->frags); +- if (res) +- return res; +- res = lowpan_frags_ns_sysctl_register(net); +- if (res) +- inet_frags_uninit_net(&ieee802154_lowpan->frags); +- return res; ++ inet_frags_init_net(&ieee802154_lowpan->frags); ++ ++ return lowpan_frags_ns_sysctl_register(net); + } + + static void __net_exit lowpan_frags_exit_net(struct net *net) +diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c +index b5e9317eaf9e..631c0d0d7cf8 100644 +--- a/net/ipv4/inet_fragment.c ++++ b/net/ipv4/inet_fragment.c +@@ -234,10 +234,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) + cond_resched(); + + if (read_seqretry(&f->rnd_seqlock, seq) || +- percpu_counter_sum(&nf->mem)) ++ sum_frag_mem_limit(nf)) + goto evict_again; +- +- percpu_counter_destroy(&nf->mem); + } + EXPORT_SYMBOL(inet_frags_exit_net); + +diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c +index bbe7f72db9c1..453db950dc9f 100644 +--- a/net/ipv4/ip_fragment.c ++++ b/net/ipv4/ip_fragment.c +@@ -835,8 +835,6 @@ static void __init ip4_frags_ctl_register(void) + + static int __net_init ipv4_frags_init_net(struct net *net) + { +- int res; +- + /* Fragment cache limits. 
+ * + * The fragment memory accounting code, (tries to) account for +@@ -862,13 +860,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) + + net->ipv4.frags.max_dist = 64; + +- res = inet_frags_init_net(&net->ipv4.frags); +- if (res) +- return res; +- res = ip4_frags_ns_ctl_register(net); +- if (res) +- inet_frags_uninit_net(&net->ipv4.frags); +- return res; ++ inet_frags_init_net(&net->ipv4.frags); ++ ++ return ip4_frags_ns_ctl_register(net); + } + + static void __net_exit ipv4_frags_exit_net(struct net *net) +diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c +index 5719d6ba0824..bd7f1836bb70 100644 +--- a/net/ipv4/ip_tunnel.c ++++ b/net/ipv4/ip_tunnel.c +@@ -609,8 +609,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) + ip_rt_put(rt); + goto tx_dropped; + } +- iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, +- key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); ++ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, ++ df, !net_eq(tunnel->net, dev_net(dev))); + return; + tx_error: + dev->stats.tx_errors++; +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 1a4db27f5833..6b3d27e50317 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -2297,6 +2297,10 @@ int tcp_disconnect(struct sock *sk, int flags) + tcp_set_ca_state(sk, TCP_CA_Open); + tcp_clear_retrans(tp); + inet_csk_delack_init(sk); ++ /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 ++ * issue in __tcp_select_window() ++ */ ++ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; + tcp_init_send_head(sk); + memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); + __sk_dst_reset(sk); +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c +index b2cabda72320..cc101b1be903 100644 +--- a/net/ipv6/addrconf.c ++++ b/net/ipv6/addrconf.c +@@ -5443,7 +5443,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) + * our DAD process, so we don't need + * to do it again + */ +- if (!(ifp->rt->rt6i_node)) ++ if (!rcu_access_pointer(ifp->rt->rt6i_node)) + ip6_ins_rt(ifp->rt); + if (ifp->idev->cnf.forwarding) + addrconf_join_anycast(ifp); +diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c +index ff389591a340..5da864997495 100644 +--- a/net/ipv6/ip6_fib.c ++++ b/net/ipv6/ip6_fib.c +@@ -148,11 +148,23 @@ static struct fib6_node *node_alloc(void) + return fn; + } + +-static void node_free(struct fib6_node *fn) ++static void node_free_immediate(struct fib6_node *fn) ++{ ++ kmem_cache_free(fib6_node_kmem, fn); ++} ++ ++static void node_free_rcu(struct rcu_head *head) + { ++ struct fib6_node *fn = container_of(head, struct fib6_node, rcu); ++ + kmem_cache_free(fib6_node_kmem, fn); + } + ++static void node_free(struct fib6_node *fn) ++{ ++ call_rcu(&fn->rcu, node_free_rcu); ++} ++ + static void rt6_rcu_free(struct rt6_info *rt) + { + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +@@ -189,6 +201,12 @@ static void rt6_release(struct rt6_info *rt) + } + } + ++static void fib6_free_table(struct fib6_table *table) ++{ ++ inetpeer_invalidate_tree(&table->tb6_peers); ++ kfree(table); ++} ++ + static void fib6_link_table(struct net *net, struct fib6_table *tb) + { + unsigned int h; +@@ -589,9 +607,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, + + if (!in || !ln) { + if (in) +- node_free(in); ++ node_free_immediate(in); + if (ln) +- node_free(ln); ++ node_free_immediate(ln); + return ERR_PTR(-ENOMEM); + } + +@@ -862,7 +880,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, + + rt->dst.rt6_next = iter; + *ins = rt; +- 
rt->rt6i_node = fn; ++ rcu_assign_pointer(rt->rt6i_node, fn); + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); + info->nl_net->ipv6.rt6_stats->fib_rt_entries++; +@@ -887,7 +905,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, + return err; + + *ins = rt; +- rt->rt6i_node = fn; ++ rcu_assign_pointer(rt->rt6i_node, fn); + rt->dst.rt6_next = iter->dst.rt6_next; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); +@@ -1020,7 +1038,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, + root, and then (in failure) stale node + in main tree. + */ +- node_free(sfn); ++ node_free_immediate(sfn); + err = PTR_ERR(sn); + goto failure; + } +@@ -1447,8 +1465,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, + + int fib6_del(struct rt6_info *rt, struct nl_info *info) + { ++ struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, ++ lockdep_is_held(&rt->rt6i_table->tb6_lock)); + struct net *net = info->nl_net; +- struct fib6_node *fn = rt->rt6i_node; + struct rt6_info **rtp; + + #if RT6_DEBUG >= 2 +@@ -1637,7 +1656,9 @@ static int fib6_clean_node(struct fib6_walker *w) + if (res) { + #if RT6_DEBUG >= 2 + pr_debug("%s: del failed: rt=%p@%p err=%d\n", +- __func__, rt, rt->rt6i_node, res); ++ __func__, rt, ++ rcu_access_pointer(rt->rt6i_node), ++ res); + #endif + continue; + } +@@ -1878,15 +1899,22 @@ static int __net_init fib6_net_init(struct net *net) + + static void fib6_net_exit(struct net *net) + { ++ unsigned int i; ++ + rt6_ifdown(net, NULL); + del_timer_sync(&net->ipv6.ip6_fib_timer); + +-#ifdef CONFIG_IPV6_MULTIPLE_TABLES +- inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); +- kfree(net->ipv6.fib6_local_tbl); +-#endif +- inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); +- kfree(net->ipv6.fib6_main_tbl); ++ for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { ++ struct hlist_head *head = &net->ipv6.fib_table_hash[i]; ++ struct hlist_node *tmp; ++ struct fib6_table *tb; ++ ++ hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { ++ hlist_del(&tb->tb6_hlist); ++ fib6_free_table(tb); ++ } ++ } ++ + kfree(net->ipv6.fib_table_hash); + kfree(net->ipv6.rt6_stats); + } +diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c +index d2844ee469cb..f78afe43bdff 100644 +--- a/net/ipv6/ip6_gre.c ++++ b/net/ipv6/ip6_gre.c +@@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + } + break; + case ICMPV6_PKT_TOOBIG: +- mtu = be32_to_cpu(info) - offset; ++ mtu = be32_to_cpu(info) - offset - t->tun_hlen; ++ if (t->dev->type == ARPHRD_ETHER) ++ mtu -= ETH_HLEN; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + t->dev->mtu = mtu; +diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c +index 986d4ca38832..b263bf3a19f7 100644 +--- a/net/ipv6/netfilter/nf_conntrack_reasm.c ++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c +@@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); + + static int nf_ct_net_init(struct net *net) + { +- int res; +- + net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; + net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; +- res = inet_frags_init_net(&net->nf_frag.frags); +- if (res) +- return res; +- res = nf_ct_frag6_sysctl_register(net); +- if (res) +- inet_frags_uninit_net(&net->nf_frag.frags); +- return res; ++ inet_frags_init_net(&net->nf_frag.frags); ++ ++ return nf_ct_frag6_sysctl_register(net); + } + + 
static void nf_ct_net_exit(struct net *net) +diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c +index abb2c307fbe8..a338bbc33cf3 100644 +--- a/net/ipv6/output_core.c ++++ b/net/ipv6/output_core.c +@@ -86,7 +86,6 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) + + while (offset <= packet_len) { + struct ipv6_opt_hdr *exthdr; +- unsigned int len; + + switch (**nexthdr) { + +@@ -112,10 +111,9 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) + + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + + offset); +- len = ipv6_optlen(exthdr); +- if (len + offset >= IPV6_MAXPLEN) ++ offset += ipv6_optlen(exthdr); ++ if (offset > IPV6_MAXPLEN) + return -EINVAL; +- offset += len; + *nexthdr = &exthdr->nexthdr; + } + +diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c +index 3815e8505ed2..e585c0a2591c 100644 +--- a/net/ipv6/reassembly.c ++++ b/net/ipv6/reassembly.c +@@ -709,19 +709,13 @@ static void ip6_frags_sysctl_unregister(void) + + static int __net_init ipv6_frags_init_net(struct net *net) + { +- int res; +- + net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; + net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; + +- res = inet_frags_init_net(&net->ipv6.frags); +- if (res) +- return res; +- res = ip6_frags_ns_sysctl_register(net); +- if (res) +- inet_frags_uninit_net(&net->ipv6.frags); +- return res; ++ inet_frags_init_net(&net->ipv6.frags); ++ ++ return ip6_frags_ns_sysctl_register(net); + } + + static void __net_exit ipv6_frags_exit_net(struct net *net) +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index 5764a84465f8..61729641e027 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -1267,7 +1267,9 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) + + static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) + { +- if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) ++ u32 rt_cookie = 0; ++ ++ if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) + return NULL; + + if (rt6_check_expired(rt)) +@@ -1335,8 +1337,14 @@ static void ip6_link_failure(struct sk_buff *skb) + if (rt->rt6i_flags & RTF_CACHE) { + dst_hold(&rt->dst); + ip6_del_rt(rt); +- } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { +- rt->rt6i_node->fn_sernum = -1; ++ } else { ++ struct fib6_node *fn; ++ ++ rcu_read_lock(); ++ fn = rcu_dereference(rt->rt6i_node); ++ if (fn && (rt->rt6i_flags & RTF_DEFAULT)) ++ fn->fn_sernum = -1; ++ rcu_read_unlock(); + } + } + } +@@ -1353,7 +1361,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) + static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) + { + return !(rt->rt6i_flags & RTF_CACHE) && +- (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); ++ (rt->rt6i_flags & RTF_PCPU || ++ rcu_access_pointer(rt->rt6i_node)); + } + + static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, +diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c +index fecad1098cf8..7eb0e8fe3ca8 100644 +--- a/net/kcm/kcmsock.c ++++ b/net/kcm/kcmsock.c +@@ -1381,6 +1381,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock, + if (!csk) + return -EINVAL; + ++ /* We must prevent loops or risk deadlock ! 
*/ ++ if (csk->sk_family == PF_KCM) ++ return -EOPNOTSUPP; ++ + psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); + if (!psock) + return -ENOMEM; +diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c +index ae7bfd26cd91..35ba4b60d927 100644 +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -2151,6 +2151,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, + struct timespec ts; + __u32 ts_status; + bool is_drop_n_account = false; ++ bool do_vnet = false; + + /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. + * We may add members to them until current aligned size without forcing +@@ -2201,8 +2202,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, + netoff = TPACKET_ALIGN(po->tp_hdrlen + + (maclen < 16 ? 16 : maclen)) + + po->tp_reserve; +- if (po->has_vnet_hdr) ++ if (po->has_vnet_hdr) { + netoff += sizeof(struct virtio_net_hdr); ++ do_vnet = true; ++ } + macoff = netoff - maclen; + } + if (po->tp_version <= TPACKET_V2) { +@@ -2219,8 +2222,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, + skb_set_owner_r(copy_skb, sk); + } + snaplen = po->rx_ring.frame_size - macoff; +- if ((int)snaplen < 0) ++ if ((int)snaplen < 0) { + snaplen = 0; ++ do_vnet = false; ++ } + } + } else if (unlikely(macoff + snaplen > + GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { +@@ -2233,6 +2238,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, + if (unlikely((int)snaplen < 0)) { + snaplen = 0; + macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; ++ do_vnet = false; + } + } + spin_lock(&sk->sk_receive_queue.lock); +@@ -2258,7 +2264,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, + } + spin_unlock(&sk->sk_receive_queue.lock); + +- if (po->has_vnet_hdr) { ++ if (do_vnet) { + if (__packet_rcv_vnet(skb, h.raw + macoff - + sizeof(struct virtio_net_hdr))) { + spin_lock(&sk->sk_receive_queue.lock); +diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c +index 048954eee984..e8f56b7c5afb 100644 +--- a/net/sctp/sctp_diag.c ++++ b/net/sctp/sctp_diag.c +@@ -70,7 +70,8 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, + + info = nla_data(attr); + list_for_each_entry_rcu(laddr, address_list, list) { +- memcpy(info, &laddr->a, addrlen); ++ memcpy(info, &laddr->a, sizeof(laddr->a)); ++ memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); + info += addrlen; + } + +@@ -93,7 +94,9 @@ static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, + info = nla_data(attr); + list_for_each_entry(from, &asoc->peer.transport_addr_list, + transports) { +- memcpy(info, &from->ipaddr, addrlen); ++ memcpy(info, &from->ipaddr, sizeof(from->ipaddr)); ++ memset(info + sizeof(from->ipaddr), 0, ++ addrlen - sizeof(from->ipaddr)); + info += addrlen; + } + +diff --git a/net/sctp/socket.c b/net/sctp/socket.c +index 9647e314d4fc..3ef725229449 100644 +--- a/net/sctp/socket.c ++++ b/net/sctp/socket.c +@@ -4373,8 +4373,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, + info->sctpi_ictrlchunks = asoc->stats.ictrlchunks; + + prim = asoc->peer.primary_path; +- memcpy(&info->sctpi_p_address, &prim->ipaddr, +- sizeof(struct sockaddr_storage)); ++ memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr)); + info->sctpi_p_state = prim->state; + info->sctpi_p_cwnd = prim->cwnd; + info->sctpi_p_srtt = prim->srtt; +diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c +index 84d0fdaf7de9..d3cfbf2f407d 100644 +--- 
a/net/sctp/ulpqueue.c
++++ b/net/sctp/ulpqueue.c
+@@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
+ 		sctp_ulpq_clear_pd(ulpq);
+ 
+ 	if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) {
+-		sp->data_ready_signalled = 1;
++		if (!sock_owned_by_user(sk))
++			sp->data_ready_signalled = 1;
+ 		sk->sk_data_ready(sk);
+ 	}
+ 	return 1;