author    Kernel Build Daemon <kbuild@suse.de>  2019-07-19 12:03:11 +0200
committer Kernel Build Daemon <kbuild@suse.de>  2019-07-19 12:03:11 +0200
commit    64c3c36a7b80cda705d39d057df4136d55890c31 (patch)
tree      d9977ed3747cbcfcf908a6e5a0f9691321d8bdf5
parent    de3dd2072ba7614ce299f501f000b06e1b299315 (diff)
Automatically updated to 5.2-11956-g3bfe1fc46794
-rw-r--r-- Documentation/admin-guide/index.rst | 1
-rw-r--r-- Documentation/admin-guide/kernel-parameters.txt | 13
-rw-r--r-- Documentation/admin-guide/xfs.rst (renamed from Documentation/filesystems/xfs.txt) | 132
-rw-r--r-- Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt | 15
-rw-r--r-- Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt | 24
-rw-r--r-- Documentation/devicetree/bindings/watchdog/renesas,wdt.txt (renamed from Documentation/devicetree/bindings/watchdog/renesas-wdt.txt) | 0
-rw-r--r-- Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt | 1
-rw-r--r-- Documentation/filesystems/dax.txt | 2
-rw-r--r-- Documentation/power/pm_qos_interface.rst | 12
-rw-r--r-- Documentation/riscv/boot-image-header.txt | 50
-rw-r--r-- Documentation/trace/kprobetrace.rst | 42
-rw-r--r-- Documentation/trace/uprobetracer.rst | 10
-rw-r--r-- Documentation/watchdog/hpwdt.rst | 4
-rw-r--r-- Documentation/watchdog/watchdog-parameters.rst | 11
-rw-r--r-- MAINTAINERS | 17
-rw-r--r-- arch/Kconfig | 19
-rw-r--r-- arch/arm/kernel/module.c | 7
-rw-r--r-- arch/arm64/Kconfig | 2
-rw-r--r-- arch/parisc/include/asm/unistd.h | 1
-rw-r--r-- arch/parisc/kernel/entry.S | 1
-rw-r--r-- arch/parisc/kernel/kprobes.c | 3
-rw-r--r-- arch/parisc/kernel/ptrace.c | 31
-rw-r--r-- arch/parisc/kernel/syscalls/syscall.tbl | 2
-rw-r--r-- arch/riscv/Kconfig | 10
-rw-r--r-- arch/riscv/Kconfig.socs | 13
-rw-r--r-- arch/riscv/boot/dts/sifive/Makefile | 2
-rw-r--r-- arch/riscv/configs/defconfig | 8
-rw-r--r-- arch/riscv/configs/rv32_defconfig | 2
-rw-r--r-- arch/riscv/include/asm/cacheflush.h | 63
-rw-r--r-- arch/riscv/include/asm/fixmap.h | 5
-rw-r--r-- arch/riscv/include/asm/hugetlb.h | 18
-rw-r--r-- arch/riscv/include/asm/image.h | 65
-rw-r--r-- arch/riscv/include/asm/page.h | 14
-rw-r--r-- arch/riscv/include/asm/pgtable-64.h | 5
-rw-r--r-- arch/riscv/include/asm/pgtable.h | 16
-rw-r--r-- arch/riscv/kernel/head.S | 49
-rw-r--r-- arch/riscv/kernel/setup.c | 6
-rw-r--r-- arch/riscv/kernel/vdso.c | 19
-rw-r--r-- arch/riscv/mm/Makefile | 2
-rw-r--r-- arch/riscv/mm/hugetlbpage.c | 44
-rw-r--r-- arch/riscv/mm/init.c | 326
-rw-r--r-- arch/riscv/mm/sifive_l2_cache.c | 11
-rw-r--r-- arch/x86/Kconfig | 4
-rw-r--r-- arch/x86/include/asm/uaccess.h | 4
-rw-r--r-- arch/x86/kernel/ftrace.c | 6
-rw-r--r-- drivers/acpi/acpi_video.c | 37
-rw-r--r-- drivers/acpi/acpica/exconfig.c | 18
-rw-r--r-- drivers/acpi/acpica/tbxfload.c | 10
-rw-r--r-- drivers/acpi/blacklist.c | 4
-rw-r--r-- drivers/acpi/nfit/core.c | 4
-rw-r--r-- drivers/base/power/domain.c | 8
-rw-r--r-- drivers/base/power/domain_governor.c | 4
-rw-r--r-- drivers/base/power/qos.c | 135
-rw-r--r-- drivers/base/power/runtime.c | 2
-rw-r--r-- drivers/block/floppy.c | 34
-rw-r--r-- drivers/block/rbd.c | 2188
-rw-r--r-- drivers/block/rbd_types.h | 10
-rw-r--r-- drivers/cpufreq/bmips-cpufreq.c | 17
-rw-r--r-- drivers/cpufreq/cpufreq.c | 216
-rw-r--r-- drivers/cpufreq/davinci-cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/imx-cpufreq-dt.c | 3
-rw-r--r-- drivers/cpufreq/imx6q-cpufreq.c | 6
-rw-r--r-- drivers/cpufreq/intel_pstate.c | 7
-rw-r--r-- drivers/cpufreq/kirkwood-cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/loongson1-cpufreq.c | 8
-rw-r--r-- drivers/cpufreq/loongson2_cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/maple-cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/omap-cpufreq.c | 15
-rw-r--r-- drivers/cpufreq/pasemi-cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/pmac32-cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/pmac64-cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/s3c2416-cpufreq.c | 9
-rw-r--r-- drivers/cpufreq/s3c64xx-cpufreq.c | 15
-rw-r--r-- drivers/cpufreq/s5pv210-cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/sa1100-cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/sa1110-cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/spear-cpufreq.c | 3
-rw-r--r-- drivers/cpufreq/tegra20-cpufreq.c | 8
-rw-r--r-- drivers/cpuidle/governor.c | 2
-rw-r--r-- drivers/dax/bus.c | 21
-rw-r--r-- drivers/dax/super.c | 19
-rw-r--r-- drivers/md/dm-kcopyd.c | 34
-rw-r--r-- drivers/md/dm-snap.c | 10
-rw-r--r-- drivers/md/dm-table.c | 24
-rw-r--r-- drivers/md/dm-zoned-metadata.c | 24
-rw-r--r-- drivers/md/dm-zoned.h | 28
-rw-r--r-- drivers/md/dm.c | 5
-rw-r--r-- drivers/md/dm.h | 5
-rw-r--r-- drivers/nvdimm/Makefile | 1
-rw-r--r-- drivers/nvdimm/claim.c | 6
-rw-r--r-- drivers/nvdimm/namespace_devs.c | 8
-rw-r--r-- drivers/nvdimm/nd.h | 1
-rw-r--r-- drivers/nvdimm/nd_virtio.c | 125
-rw-r--r-- drivers/nvdimm/pmem.c | 18
-rw-r--r-- drivers/nvdimm/region_devs.c | 33
-rw-r--r-- drivers/nvdimm/virtio_pmem.c | 122
-rw-r--r-- drivers/nvdimm/virtio_pmem.h | 55
-rw-r--r-- drivers/powercap/Kconfig | 11
-rw-r--r-- drivers/powercap/Makefile | 3
-rw-r--r-- drivers/powercap/intel_rapl_common.c (renamed from drivers/powercap/intel_rapl.c) | 801
-rw-r--r-- drivers/powercap/intel_rapl_msr.c | 183
-rw-r--r-- drivers/s390/block/dcssblk.c | 2
-rw-r--r-- drivers/thermal/intel/int340x_thermal/Kconfig | 6
-rw-r--r-- drivers/thermal/intel/int340x_thermal/processor_thermal_device.c | 173
-rw-r--r-- drivers/virtio/Kconfig | 11
-rw-r--r-- drivers/watchdog/Kconfig | 10
-rw-r--r-- drivers/watchdog/acquirewdt.c | 6
-rw-r--r-- drivers/watchdog/advantechwdt.c | 6
-rw-r--r-- drivers/watchdog/aspeed_wdt.c | 8
-rw-r--r-- drivers/watchdog/bcm2835_wdt.c | 5
-rw-r--r-- drivers/watchdog/bcm7038_wdt.c | 4
-rw-r--r-- drivers/watchdog/bcm_kona_wdt.c | 4
-rw-r--r-- drivers/watchdog/cadence_wdt.c | 4
-rw-r--r-- drivers/watchdog/da9052_wdt.c | 9
-rw-r--r-- drivers/watchdog/da9062_wdt.c | 5
-rw-r--r-- drivers/watchdog/davinci_wdt.c | 14
-rw-r--r-- drivers/watchdog/digicolor_wdt.c | 9
-rw-r--r-- drivers/watchdog/ebc-c384_wdt.c | 9
-rw-r--r-- drivers/watchdog/eurotechwdt.c | 6
-rw-r--r-- drivers/watchdog/ftwdt010_wdt.c | 4
-rw-r--r-- drivers/watchdog/gpio_wdt.c | 7
-rw-r--r-- drivers/watchdog/hpwdt.c | 59
-rw-r--r-- drivers/watchdog/i6300esb.c | 5
-rw-r--r-- drivers/watchdog/iTCO_vendor_support.c | 7
-rw-r--r-- drivers/watchdog/iTCO_wdt.c | 6
-rw-r--r-- drivers/watchdog/ib700wdt.c | 6
-rw-r--r-- drivers/watchdog/ie6xx_wdt.c | 8
-rw-r--r-- drivers/watchdog/imx2_wdt.c | 4
-rw-r--r-- drivers/watchdog/imx_sc_wdt.c | 123
-rw-r--r-- drivers/watchdog/intel-mid_wdt.c | 4
-rw-r--r-- drivers/watchdog/jz4740_wdt.c | 57
-rw-r--r-- drivers/watchdog/loongson1_wdt.c | 4
-rw-r--r-- drivers/watchdog/max77620_wdt.c | 8
-rw-r--r-- drivers/watchdog/mei_wdt.c | 4
-rw-r--r-- drivers/watchdog/mena21_wdt.c | 4
-rw-r--r-- drivers/watchdog/menf21bmc_wdt.c | 4
-rw-r--r-- drivers/watchdog/mpc8xxx_wdt.c | 5
-rw-r--r-- drivers/watchdog/mv64x60_wdt.c | 6
-rw-r--r-- drivers/watchdog/ni903x_wdt.c | 4
-rw-r--r-- drivers/watchdog/nic7018_wdt.c | 1
-rw-r--r-- drivers/watchdog/npcm_wdt.c | 4
-rw-r--r-- drivers/watchdog/nv_tco.h | 6
-rw-r--r-- drivers/watchdog/octeon-wdt-main.c | 11
-rw-r--r-- drivers/watchdog/of_xilinx_wdt.c | 4
-rw-r--r-- drivers/watchdog/omap_wdt.c | 6
-rw-r--r-- drivers/watchdog/omap_wdt.h | 21
-rw-r--r-- drivers/watchdog/pc87413_wdt.c | 6
-rw-r--r-- drivers/watchdog/pcwd_pci.c | 6
-rw-r--r-- drivers/watchdog/pcwd_usb.c | 6
-rw-r--r-- drivers/watchdog/pic32-dmt.c | 4
-rw-r--r-- drivers/watchdog/pic32-wdt.c | 4
-rw-r--r-- drivers/watchdog/pnx4008_wdt.c | 9
-rw-r--r-- drivers/watchdog/qcom-wdt.c | 4
-rw-r--r-- drivers/watchdog/rave-sp-wdt.c | 1
-rw-r--r-- drivers/watchdog/renesas_wdt.c | 35
-rw-r--r-- drivers/watchdog/retu_wdt.c | 10
-rw-r--r-- drivers/watchdog/s3c2410_wdt.c | 4
-rw-r--r-- drivers/watchdog/sa1100_wdt.c | 6
-rw-r--r-- drivers/watchdog/sama5d4_wdt.c | 29
-rw-r--r-- drivers/watchdog/sbc7240_wdt.c | 11
-rw-r--r-- drivers/watchdog/sbc8360.c | 6
-rw-r--r-- drivers/watchdog/sch311x_wdt.c | 6
-rw-r--r-- drivers/watchdog/softdog.c | 6
-rw-r--r-- drivers/watchdog/sp5100_tco.c | 4
-rw-r--r-- drivers/watchdog/sp805_wdt.c | 5
-rw-r--r-- drivers/watchdog/sprd_wdt.c | 1
-rw-r--r-- drivers/watchdog/st_lpc_wdt.c | 4
-rw-r--r-- drivers/watchdog/stm32_iwdg.c | 4
-rw-r--r-- drivers/watchdog/stmp3xxx_rtc_wdt.c | 4
-rw-r--r-- drivers/watchdog/tegra_wdt.c | 4
-rw-r--r-- drivers/watchdog/ts4800_wdt.c | 4
-rw-r--r-- drivers/watchdog/w83627hf_wdt.c | 6
-rw-r--r-- drivers/watchdog/wafer5823wdt.c | 6
-rw-r--r-- drivers/watchdog/watchdog_core.c | 22
-rw-r--r-- drivers/watchdog/watchdog_core.h | 6
-rw-r--r-- drivers/watchdog/watchdog_dev.c | 54
-rw-r--r-- drivers/watchdog/wd501p.h | 6
-rw-r--r-- drivers/watchdog/wdt.c | 6
-rw-r--r-- drivers/watchdog/wdt_pci.c | 6
-rw-r--r-- drivers/watchdog/wm831x_wdt.c | 9
-rw-r--r-- drivers/watchdog/xen_wdt.c | 4
-rw-r--r-- drivers/xen/swiotlb-xen.c | 2
-rw-r--r-- fs/ceph/Kconfig | 12
-rw-r--r-- fs/ceph/acl.c | 22
-rw-r--r-- fs/ceph/addr.c | 2
-rw-r--r-- fs/ceph/caps.c | 120
-rw-r--r-- fs/ceph/debugfs.c | 2
-rw-r--r-- fs/ceph/dir.c | 73
-rw-r--r-- fs/ceph/export.c | 2
-rw-r--r-- fs/ceph/file.c | 34
-rw-r--r-- fs/ceph/inode.c | 208
-rw-r--r-- fs/ceph/mds_client.c | 120
-rw-r--r-- fs/ceph/mds_client.h | 4
-rw-r--r-- fs/ceph/mdsmap.c | 12
-rw-r--r-- fs/ceph/quota.c | 15
-rw-r--r-- fs/ceph/snap.c | 3
-rw-r--r-- fs/ceph/super.c | 13
-rw-r--r-- fs/ceph/super.h | 67
-rw-r--r-- fs/ceph/xattr.c | 456
-rw-r--r-- fs/cifs/Kconfig | 18
-rw-r--r-- fs/cifs/Makefile | 3
-rw-r--r-- fs/cifs/cifs_debug.c | 2
-rw-r--r-- fs/cifs/cifs_fs_sb.h | 6
-rw-r--r-- fs/cifs/cifsfs.c | 14
-rw-r--r-- fs/cifs/cifsglob.h | 7
-rw-r--r-- fs/cifs/cifssmb.c | 16
-rw-r--r-- fs/cifs/connect.c | 61
-rw-r--r-- fs/cifs/dfs_cache.c | 2
-rw-r--r-- fs/cifs/inode.c | 8
-rw-r--r-- fs/cifs/misc.c | 1
-rw-r--r-- fs/cifs/smb1ops.c | 3
-rw-r--r-- fs/cifs/smb2inode.c | 12
-rw-r--r-- fs/cifs/smb2ops.c | 143
-rw-r--r-- fs/cifs/smb2pdu.c | 96
-rw-r--r-- fs/cifs/smb2pdu.h | 36
-rw-r--r-- fs/cifs/smb2transport.c | 10
-rw-r--r-- fs/cifs/transport.c | 46
-rw-r--r-- fs/cifs/xattr.c | 4
-rw-r--r-- fs/dax.c | 53
-rw-r--r-- fs/ext4/file.c | 10
-rw-r--r-- fs/nfs/Makefile | 3
-rw-r--r-- fs/nfs/callback_proc.c | 28
-rw-r--r-- fs/nfs/client.c | 24
-rw-r--r-- fs/nfs/dir.c | 94
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.c | 26
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2
-rw-r--r-- fs/nfs/inode.c | 30
-rw-r--r-- fs/nfs/internal.h | 7
-rw-r--r-- fs/nfs/netns.h | 3
-rw-r--r-- fs/nfs/nfs2xdr.c | 2
-rw-r--r-- fs/nfs/nfs3client.c | 3
-rw-r--r-- fs/nfs/nfs3xdr.c | 2
-rw-r--r-- fs/nfs/nfs4_fs.h | 4
-rw-r--r-- fs/nfs/nfs4client.c | 14
-rw-r--r-- fs/nfs/nfs4file.c | 8
-rw-r--r-- fs/nfs/nfs4proc.c | 80
-rw-r--r-- fs/nfs/nfs4state.c | 49
-rw-r--r-- fs/nfs/nfs4trace.c | 8
-rw-r--r-- fs/nfs/nfs4trace.h | 283
-rw-r--r-- fs/nfs/nfs4xdr.c | 16
-rw-r--r-- fs/nfs/nfstrace.h | 233
-rw-r--r-- fs/nfs/pagelist.c | 6
-rw-r--r-- fs/nfs/pnfs.c | 20
-rw-r--r-- fs/nfs/super.c | 57
-rw-r--r-- fs/nfs/sysfs.c | 187
-rw-r--r-- fs/nfs/sysfs.h | 25
-rw-r--r-- fs/nfs/write.c | 7
-rw-r--r-- fs/xfs/Makefile | 4
-rw-r--r-- fs/xfs/libxfs/xfs_trans_inode.c (renamed from fs/xfs/xfs_trans_inode.c) | 4
-rw-r--r-- fs/xfs/xfs_file.c | 9
-rw-r--r-- include/linux/acpi.h | 5
-rw-r--r-- include/linux/ceph/ceph_features.h | 1
-rw-r--r-- include/linux/ceph/ceph_fs.h | 2
-rw-r--r-- include/linux/ceph/cls_lock_client.h | 3
-rw-r--r-- include/linux/ceph/decode.h | 13
-rw-r--r-- include/linux/ceph/libceph.h | 10
-rw-r--r-- include/linux/ceph/mon_client.h | 1
-rw-r--r-- include/linux/ceph/osd_client.h | 12
-rw-r--r-- include/linux/ceph/striper.h | 2
-rw-r--r-- include/linux/cpufreq.h | 14
-rw-r--r-- include/linux/dax.h | 41
-rw-r--r-- include/linux/device-mapper.h | 17
-rw-r--r-- include/linux/ftrace.h | 4
-rw-r--r-- include/linux/intel_rapl.h | 155
-rw-r--r-- include/linux/iversion.h | 24
-rw-r--r-- include/linux/libnvdimm.h | 10
-rw-r--r-- include/linux/moduleloader.h | 5
-rw-r--r-- include/linux/nfs4.h | 1
-rw-r--r-- include/linux/nfs_fs.h | 2
-rw-r--r-- include/linux/nfs_fs_sb.h | 1
-rw-r--r-- include/linux/pm_qos.h | 48
-rw-r--r-- include/linux/sunrpc/bc_xprt.h | 1
-rw-r--r-- include/linux/sunrpc/clnt.h | 4
-rw-r--r-- include/linux/sunrpc/metrics.h | 7
-rw-r--r-- include/linux/sunrpc/sched.h | 4
-rw-r--r-- include/linux/sunrpc/xprt.h | 10
-rw-r--r-- include/linux/sunrpc/xprtmultipath.h | 2
-rw-r--r-- include/linux/sunrpc/xprtsock.h | 5
-rw-r--r-- include/linux/trace_events.h | 9
-rw-r--r-- include/linux/uaccess.h | 20
-rw-r--r-- include/sound/hda_codec.h | 2
-rw-r--r-- include/trace/events/rpcrdma.h | 90
-rw-r--r-- include/uapi/linux/virtio_ids.h | 1
-rw-r--r-- include/uapi/linux/virtio_pmem.h | 34
-rw-r--r-- kernel/dma/swiotlb.c | 30
-rw-r--r-- kernel/kprobes.c | 3
-rw-r--r-- kernel/module.c | 60
-rw-r--r-- kernel/trace/Kconfig | 12
-rw-r--r-- kernel/trace/ftrace.c | 48
-rw-r--r-- kernel/trace/ring_buffer.c | 17
-rw-r--r-- kernel/trace/trace.c | 17
-rw-r--r-- kernel/trace/trace_event_perf.c | 3
-rw-r--r-- kernel/trace/trace_events.c | 10
-rw-r--r-- kernel/trace/trace_events_filter.c | 3
-rw-r--r-- kernel/trace/trace_kprobe.c | 357
-rw-r--r-- kernel/trace/trace_probe.c | 142
-rw-r--r-- kernel/trace/trace_probe.h | 77
-rw-r--r-- kernel/trace/trace_probe_tmpl.h | 36
-rw-r--r-- kernel/trace/trace_uprobe.c | 180
-rw-r--r-- kernel/tracepoint.c | 4
-rw-r--r-- mm/maccess.c | 122
-rw-r--r-- net/ceph/Makefile | 2
-rw-r--r-- net/ceph/cls_lock_client.c | 54
-rw-r--r-- net/ceph/decode.c | 84
-rw-r--r-- net/ceph/messenger.c | 14
-rw-r--r-- net/ceph/mon_client.c | 21
-rw-r--r-- net/ceph/osd_client.c | 42
-rw-r--r-- net/ceph/osdmap.c | 31
-rw-r--r-- net/ceph/pagevec.c | 33
-rw-r--r-- net/ceph/striper.c | 17
-rw-r--r-- net/sunrpc/Kconfig | 2
-rw-r--r-- net/sunrpc/backchannel_rqst.c | 40
-rw-r--r-- net/sunrpc/clnt.c | 95
-rw-r--r-- net/sunrpc/debugfs.c | 52
-rw-r--r-- net/sunrpc/sched.c | 81
-rw-r--r-- net/sunrpc/stats.c | 23
-rw-r--r-- net/sunrpc/svc.c | 2
-rw-r--r-- net/sunrpc/xprt.c | 101
-rw-r--r-- net/sunrpc/xprtmultipath.c | 89
-rw-r--r-- net/sunrpc/xprtrdma/backchannel.c | 7
-rw-r--r-- net/sunrpc/xprtrdma/frwr_ops.c | 327
-rw-r--r-- net/sunrpc/xprtrdma/rpc_rdma.c | 152
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 4
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_transport.c | 8
-rw-r--r-- net/sunrpc/xprtrdma/transport.c | 84
-rw-r--r-- net/sunrpc/xprtrdma/verbs.c | 115
-rw-r--r-- net/sunrpc/xprtrdma/xprt_rdma.h | 45
-rw-r--r-- net/sunrpc/xprtsock.c | 126
-rw-r--r-- sound/core/seq/seq_clientmgr.c | 11
-rw-r--r-- sound/pci/au88x0/au88x0_a3d.c | 15
-rw-r--r-- sound/pci/emu10k1/emu10k1x.c | 3
-rw-r--r-- sound/pci/hda/hda_codec.c | 8
-rw-r--r-- sound/pci/hda/patch_hdmi.c | 31
-rw-r--r-- sound/pci/hda/patch_realtek.c | 10
-rw-r--r-- sound/pci/lx6464es/lx6464es.c | 3
-rw-r--r-- sound/pci/rme9652/rme9652.c | 3
-rw-r--r-- sound/ppc/snd_ps3.c | 3
-rw-r--r-- tools/perf/Documentation/perf-probe.txt | 3
-rw-r--r-- tools/perf/util/probe-event.c | 11
-rw-r--r-- tools/perf/util/probe-event.h | 2
-rw-r--r-- tools/perf/util/probe-file.c | 7
-rw-r--r-- tools/perf/util/probe-file.h | 1
-rw-r--r-- tools/perf/util/probe-finder.c | 19
-rwxr-xr-x tools/testing/selftests/ftrace/ftracetest | 38
-rw-r--r-- tools/testing/selftests/ftrace/test.d/functions | 4
-rw-r--r-- tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc | 32
346 files changed, 8787 insertions, 4108 deletions
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 280355d08af5..33feab2f4084 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -77,6 +77,7 @@ configure specific aspects of kernel behavior to your liking.
blockdev/index
ext4
binderfs
+ xfs
pm/index
thunderbolt
LSM/index
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a5f4004e8705..f0461456d910 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2011,6 +2011,19 @@
Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
the default is off.
+ kprobe_event=[probe-list]
+ [FTRACE] Add kprobe events and enable them at boot time.
+ The probe-list is a semicolon-delimited list of probe
+ definitions. Each definition uses the same format as the
+ kprobe_events interface, but the parameters are comma-delimited.
+ For example, to add a kprobe event on vfs_read with
+ arg1 and arg2, add the following to the command line:
+
+ kprobe_event=p,vfs_read,$arg1,$arg2
+
+ See also Documentation/trace/kprobetrace.rst "Kernel
+ Boot Parameter" section.
+
kpti= [ARM64] Control page table isolation of user
and kernel address spaces.
Default: enabled on cores which need mitigation.
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/admin-guide/xfs.rst
index a5cbb5e0e3db..e76665a8f2f2 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/admin-guide/xfs.rst
@@ -1,4 +1,6 @@
+.. SPDX-License-Identifier: GPL-2.0
+======================
The SGI XFS Filesystem
======================
@@ -18,8 +20,6 @@ Mount Options
=============
When mounting an XFS filesystem, the following options are accepted.
-For boolean mount options, the names with the (*) suffix is the
-default behaviour.
allocsize=size
Sets the buffered I/O end-of-file preallocation size when
@@ -31,46 +31,43 @@ default behaviour.
preallocation size, which uses a set of heuristics to
optimise the preallocation size based on the current
allocation patterns within the file and the access patterns
- to the file. Specifying a fixed allocsize value turns off
+ to the file. Specifying a fixed ``allocsize`` value turns off
the dynamic behaviour.
- attr2
- noattr2
+ attr2 or noattr2
The options enable/disable an "opportunistic" improvement to
be made in the way inline extended attributes are stored
on-disk. When the new form is used for the first time when
- attr2 is selected (either when setting or removing extended
+ ``attr2`` is selected (either when setting or removing extended
attributes) the on-disk superblock feature bit field will be
updated to reflect this format being in use.
The default behaviour is determined by the on-disk feature
- bit indicating that attr2 behaviour is active. If either
- mount option it set, then that becomes the new default used
+ bit indicating that ``attr2`` behaviour is active. If either
+ mount option is set, then that becomes the new default used
by the filesystem.
- CRC enabled filesystems always use the attr2 format, and so
- will reject the noattr2 mount option if it is set.
+ CRC enabled filesystems always use the ``attr2`` format, and so
+ will reject the ``noattr2`` mount option if it is set.
- discard
- nodiscard (*)
+ discard or nodiscard (default)
Enable/disable the issuing of commands to let the block
device reclaim space freed by the filesystem. This is
useful for SSD devices, thinly provisioned LUNs and virtual
machine images, but may have a performance impact.
- Note: It is currently recommended that you use the fstrim
- application to discard unused blocks rather than the discard
+ Note: It is currently recommended that you use the ``fstrim``
+ application to ``discard`` unused blocks rather than the ``discard``
mount option because the performance impact of this option
is quite severe.
- grpid/bsdgroups
- nogrpid/sysvgroups (*)
+ grpid/bsdgroups or nogrpid/sysvgroups (default)
These options define what group ID a newly created file
- gets. When grpid is set, it takes the group ID of the
+ gets. When ``grpid`` is set, it takes the group ID of the
directory in which it is created; otherwise it takes the
- fsgid of the current process, unless the directory has the
- setgid bit set, in which case it takes the gid from the
- parent directory, and also gets the setgid bit set if it is
+ ``fsgid`` of the current process, unless the directory has the
+ ``setgid`` bit set, in which case it takes the ``gid`` from the
+ parent directory, and also gets the ``setgid`` bit set if it is
a directory itself.
filestreams
@@ -78,46 +75,42 @@ default behaviour.
across the entire filesystem rather than just on directories
configured to use it.
- ikeep
- noikeep (*)
- When ikeep is specified, XFS does not delete empty inode
- clusters and keeps them around on disk. When noikeep is
+ ikeep or noikeep (default)
+ When ``ikeep`` is specified, XFS does not delete empty inode
+ clusters and keeps them around on disk. When ``noikeep`` is
specified, empty inode clusters are returned to the free
space pool.
- inode32
- inode64 (*)
- When inode32 is specified, it indicates that XFS limits
+ inode32 or inode64 (default)
+ When ``inode32`` is specified, it indicates that XFS limits
inode creation to locations which will not result in inode
numbers with more than 32 bits of significance.
- When inode64 is specified, it indicates that XFS is allowed
+ When ``inode64`` is specified, it indicates that XFS is allowed
to create inodes at any location in the filesystem,
including those which will result in inode numbers occupying
- more than 32 bits of significance.
+ more than 32 bits of significance.
- inode32 is provided for backwards compatibility with older
+ ``inode32`` is provided for backwards compatibility with older
systems and applications, since 64 bits inode numbers might
cause problems for some applications that cannot handle
large inode numbers. If applications are in use which do
- not handle inode numbers bigger than 32 bits, the inode32
+ not handle inode numbers bigger than 32 bits, the ``inode32``
option should be specified.
-
- largeio
- nolargeio (*)
- If "nolargeio" is specified, the optimal I/O reported in
- st_blksize by stat(2) will be as small as possible to allow
+ largeio or nolargeio (default)
+ If ``nolargeio`` is specified, the optimal I/O reported in
+ ``st_blksize`` by **stat(2)** will be as small as possible to allow
user applications to avoid inefficient read/modify/write
I/O. This is typically the page size of the machine, as
this is the granularity of the page cache.
- If "largeio" specified, a filesystem that was created with a
- "swidth" specified will return the "swidth" value (in bytes)
- in st_blksize. If the filesystem does not have a "swidth"
- specified but does specify an "allocsize" then "allocsize"
+ If ``largeio`` is specified, a filesystem that was created with a
+ ``swidth`` specified will return the ``swidth`` value (in bytes)
+ in ``st_blksize``. If the filesystem does not have a ``swidth``
+ specified but does specify an ``allocsize`` then ``allocsize``
(in bytes) will be returned instead. Otherwise the behaviour
- is the same as if "nolargeio" was specified.
+ is the same as if ``nolargeio`` was specified.
logbufs=value
Set the number of in-memory log buffers. Valid numbers
@@ -127,7 +120,7 @@ default behaviour.
If the memory cost of 8 log buffers is too high on small
systems, then it may be reduced at some cost to performance
- on metadata intensive workloads. The logbsize option below
+ on metadata intensive workloads. The ``logbsize`` option below
controls the size of each buffer and so is also relevant to
this case.
@@ -138,7 +131,7 @@ default behaviour.
and 32768 (32k). Valid sizes for version 2 logs also
include 65536 (64k), 131072 (128k) and 262144 (256k). The
logbsize must be an integer multiple of the log
- stripe unit configured at mkfs time.
+ stripe unit configured at **mkfs(8)** time.
The default value for version 1 logs is 32768, while the
default value for version 2 logs is MAX(32768, log_sunit).
@@ -153,21 +146,21 @@ default behaviour.
noalign
Data allocations will not be aligned at stripe unit
boundaries. This is only relevant to filesystems created
- with non-zero data alignment parameters (sunit, swidth) by
- mkfs.
+ with non-zero data alignment parameters (``sunit``, ``swidth``) by
+ **mkfs(8)**.
norecovery
The filesystem will be mounted without running log recovery.
If the filesystem was not cleanly unmounted, it is likely to
- be inconsistent when mounted in "norecovery" mode.
+ be inconsistent when mounted in ``norecovery`` mode.
Some files or directories may not be accessible because of this.
- Filesystems mounted "norecovery" must be mounted read-only or
+ Filesystems mounted ``norecovery`` must be mounted read-only or
the mount will fail.
nouuid
Don't check for double mounted file systems using the file
- system uuid. This is useful to mount LVM snapshot volumes,
- and often used in combination with "norecovery" for mounting
+ system ``uuid``. This is useful to mount LVM snapshot volumes,
+ and often used in combination with ``norecovery`` for mounting
read-only snapshots.
noquota
@@ -176,15 +169,15 @@ default behaviour.
uquota/usrquota/uqnoenforce/quota
User disk quota accounting enabled, and limits (optionally)
- enforced. Refer to xfs_quota(8) for further details.
+ enforced. Refer to **xfs_quota(8)** for further details.
gquota/grpquota/gqnoenforce
Group disk quota accounting enabled and limits (optionally)
- enforced. Refer to xfs_quota(8) for further details.
+ enforced. Refer to **xfs_quota(8)** for further details.
pquota/prjquota/pqnoenforce
Project disk quota accounting enabled and limits (optionally)
- enforced. Refer to xfs_quota(8) for further details.
+ enforced. Refer to **xfs_quota(8)** for further details.
sunit=value and swidth=value
Used to specify the stripe unit and width for a RAID device
@@ -192,11 +185,11 @@ default behaviour.
block units. These options are only relevant to filesystems
that were created with non-zero data alignment parameters.
- The sunit and swidth parameters specified must be compatible
+ The ``sunit`` and ``swidth`` parameters specified must be compatible
with the existing filesystem alignment characteristics. In
- general, that means the only valid changes to sunit are
- increasing it by a power-of-2 multiple. Valid swidth values
- are any integer multiple of a valid sunit value.
+ general, that means the only valid changes to ``sunit`` are
+ increasing it by a power-of-2 multiple. Valid ``swidth`` values
+ are any integer multiple of a valid ``sunit`` value.
Typically the only time these mount options are necessary is
after an underlying RAID device has had its geometry
@@ -221,22 +214,25 @@ default behaviour.
Deprecated Mount Options
========================
+=========================== ================
Name                        Removal Schedule
- ---- ----------------
+=========================== ================
+=========================== ================
Removed Mount Options
=====================
+=========================== =======
Name                        Removed
- ----                        -------
+=========================== =======
delaylog/nodelaylog         v4.0
ihashsize                   v4.0
irixsgid                    v4.0
osyncisdsync/osyncisosync   v4.0
barrier                     v4.19
nobarrier                   v4.19
-
+=========================== =======
sysctls
=======
@@ -302,27 +298,27 @@ The following sysctls are available for the XFS filesystem:
fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1)
Setting this to "1" will cause the "sync" flag set
- by the xfs_io(8) chattr command on a directory to be
+ by the **xfs_io(8)** chattr command on a directory to be
inherited by files in that directory.
fs.xfs.inherit_nodump (Min: 0 Default: 1 Max: 1)
Setting this to "1" will cause the "nodump" flag set
- by the xfs_io(8) chattr command on a directory to be
+ by the **xfs_io(8)** chattr command on a directory to be
inherited by files in that directory.
fs.xfs.inherit_noatime (Min: 0 Default: 1 Max: 1)
Setting this to "1" will cause the "noatime" flag set
- by the xfs_io(8) chattr command on a directory to be
+ by the **xfs_io(8)** chattr command on a directory to be
inherited by files in that directory.
fs.xfs.inherit_nosymlinks (Min: 0 Default: 1 Max: 1)
Setting this to "1" will cause the "nosymlinks" flag set
- by the xfs_io(8) chattr command on a directory to be
+ by the **xfs_io(8)** chattr command on a directory to be
inherited by files in that directory.
fs.xfs.inherit_nodefrag (Min: 0 Default: 1 Max: 1)
Setting this to "1" will cause the "nodefrag" flag set
- by the xfs_io(8) chattr command on a directory to be
+ by the **xfs_io(8)** chattr command on a directory to be
inherited by files in that directory.
fs.xfs.rotorstep (Min: 1 Default: 1 Max: 256)
@@ -368,7 +364,7 @@ handler:
-error handlers:
Defines the behavior for a specific error.
-The filesystem behavior during an error can be set via sysfs files. Each
+The filesystem behavior during an error can be set via ``sysfs`` files. Each
error handler works independently - the first condition met by an error handler
for a specific class will cause the error to be propagated rather than reset and
retried.
@@ -419,7 +415,7 @@ level directory:
handler configurations.
Note: there is no guarantee that fail_at_unmount can be set while an
- unmount is in progress. It is possible that the sysfs entries are
+ unmount is in progress. It is possible that the ``sysfs`` entries are
removed by the unmounting filesystem before a "retry forever" error
handler configuration causes unmount to hang, and hence the filesystem
must be configured appropriately before unmount begins to prevent
@@ -428,7 +424,7 @@ level directory:
Each filesystem has specific error class handlers that define the error
propagation behaviour for specific errors. There is also a "default" error
handler defined, which defines the behaviour for all errors that don't have
-specific handlers defined. Where multiple retry constraints are configuredi for
+specific handlers defined. Where multiple retry constraints are configured for
a single error, the first retry configuration that expires will cause the error
to be propagated. The handler configurations are found in the directory:
@@ -463,7 +459,7 @@ to be propagated. The handler configurations are found in the directory:
Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
operation for up to "N" seconds before propagating the error.
-Note: The default behaviour for a specific error handler is dependent on both
+**Note:** The default behaviour for a specific error handler is dependent on both
the class and error context. For example, the default values for
"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
to "fail immediately" behaviour. This is done because ENODEV is a fatal,
diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
index f378922906f6..a575e42f7fec 100644
--- a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
+++ b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
@@ -145,6 +145,16 @@ Optional Child nodes:
- Data cells of ocotp:
Detailed bindings are described in bindings/nvmem/nvmem.txt
+Watchdog bindings based on SCU Message Protocol
+------------------------------------------------------------
+
+Required properties:
+- compatible: should be:
+ "fsl,imx8qxp-sc-wdt"
+ followed by "fsl,imx-sc-wdt";
+Optional properties:
+- timeout-sec: contains the watchdog timeout in seconds.
+
Example (imx8qxp):
-------------
aliases {
@@ -207,6 +217,11 @@ firmware {
rtc: rtc {
compatible = "fsl,imx8qxp-sc-rtc";
};
+
+ watchdog {
+ compatible = "fsl,imx8qxp-sc-wdt", "fsl,imx-sc-wdt";
+ timeout-sec = <60>;
+ };
};
};
diff --git a/Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt b/Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt
deleted file mode 100644
index 02b87e92ae68..000000000000
--- a/Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-* Freescale i.MX System Controller Watchdog
-
-i.MX system controller watchdog is for i.MX SoCs with system controller inside,
-the watchdog is managed by system controller, users can ONLY communicate with
-system controller from secure mode for watchdog operations, so Linux i.MX system
-controller watchdog driver will call ARM SMC API and trap into ARM-Trusted-Firmware
-for watchdog operations, ARM-Trusted-Firmware is running at secure EL3 mode and
-it will request system controller to execute the watchdog operation passed from
-Linux kernel.
-
-Required properties:
-- compatible: Should be :
- "fsl,imx8qxp-sc-wdt"
- followed by "fsl,imx-sc-wdt";
-
-Optional properties:
-- timeout-sec : Contains the watchdog timeout in seconds.
-
-Examples:
-
-watchdog {
- compatible = "fsl,imx8qxp-sc-wdt", "fsl,imx-sc-wdt";
- timeout-sec = <60>;
-};
diff --git a/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt b/Documentation/devicetree/bindings/watchdog/renesas,wdt.txt
index 9f365c1a3399..9f365c1a3399 100644
--- a/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/renesas,wdt.txt
diff --git a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt b/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
index 46055254e8dd..e65198d82a2b 100644
--- a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
@@ -6,6 +6,7 @@ Required properties:
"allwinner,sun4i-a10-wdt"
"allwinner,sun6i-a31-wdt"
"allwinner,sun50i-a64-wdt","allwinner,sun6i-a31-wdt"
+ "allwinner,sun50i-h6-wdt","allwinner,sun6i-a31-wdt"
"allwinner,suniv-f1c100s-wdt", "allwinner,sun4i-a10-wdt"
- reg : Specifies base physical address and size of the registers.
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 6d2c0d340dea..679729442fd2 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -76,7 +76,7 @@ exposure of uninitialized data through mmap.
These filesystems may be used for inspiration:
- ext2: see Documentation/filesystems/ext2.txt
- ext4: see Documentation/filesystems/ext4/
-- xfs: see Documentation/filesystems/xfs.txt
+- xfs: see Documentation/admin-guide/xfs.rst
Handling Media Errors
diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst
index 945fc6d760c9..69921f072ce1 100644
--- a/Documentation/power/pm_qos_interface.rst
+++ b/Documentation/power/pm_qos_interface.rst
@@ -129,7 +129,7 @@ int dev_pm_qos_remove_request(handle):
and call the notification trees if the target was changed as a result of
removing the request.
-s32 dev_pm_qos_read_value(device):
+s32 dev_pm_qos_read_value(device, type):
Returns the aggregated value for a given device's constraints list.
enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
@@ -176,12 +176,14 @@ Notification mechanisms:
The per-device PM QoS framework has a per-device notification tree.
-int dev_pm_qos_add_notifier(device, notifier):
- Adds a notification callback function for the device.
+int dev_pm_qos_add_notifier(device, notifier, type):
+ Adds a notification callback function for the device for a particular request
+ type.
+
The callback is called when the aggregated value of the device constraints list
- is changed (for resume latency device PM QoS only).
+ is changed.
-int dev_pm_qos_remove_notifier(device, notifier):
+int dev_pm_qos_remove_notifier(device, notifier, type):
Removes the notification callback function for the device.
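
A minimal sketch of the updated notifier API described above, assuming the dev_pm_qos_add_notifier() signature and the DEV_PM_QOS_RESUME_LATENCY request type from include/linux/pm_qos.h in this release; the callback and registration function names are illustrative.

/*
 * Sketch (assumption, not from this patch): registering a per-device
 * PM QoS notifier with the new explicit request-type argument.
 */
#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/pm_qos.h>
#include <linux/printk.h>

static int resume_latency_notify(struct notifier_block *nb,
				 unsigned long value, void *data)
{
	/* 'value' is the new aggregated resume-latency constraint */
	pr_info("resume latency constraint is now %lu\n", value);
	return NOTIFY_OK;
}

static struct notifier_block resume_latency_nb = {
	.notifier_call = resume_latency_notify,
};

static int register_latency_notifier(struct device *dev)
{
	/* The request type is now passed explicitly, per the change above */
	return dev_pm_qos_add_notifier(dev, &resume_latency_nb,
				       DEV_PM_QOS_RESUME_LATENCY);
}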
diff --git a/Documentation/riscv/boot-image-header.txt b/Documentation/riscv/boot-image-header.txt
new file mode 100644
index 000000000000..1b73fea23b39
--- /dev/null
+++ b/Documentation/riscv/boot-image-header.txt
@@ -0,0 +1,50 @@
+ Boot image header in RISC-V Linux
+ =============================================
+
+Author: Atish Patra <atish.patra@wdc.com>
+Date : 20 May 2019
+
+This document only describes the boot image header details for RISC-V Linux.
+The complete booting guide will be available at Documentation/riscv/booting.txt.
+
+The following 64-byte header is present in the decompressed Linux kernel image.
+
+ u32 code0; /* Executable code */
+ u32 code1; /* Executable code */
+ u64 text_offset; /* Image load offset, little endian */
+ u64 image_size; /* Effective Image size, little endian */
+ u64 flags; /* kernel flags, little endian */
+ u32 version; /* Version of this header */
+ u32 res1 = 0; /* Reserved */
+ u64 res2 = 0; /* Reserved */
+ u64 magic = 0x5643534952; /* Magic number, little endian, "RISCV" */
+ u32 res3; /* Reserved for additional RISC-V specific header */
+ u32 res4; /* Reserved for PE COFF offset */
+
+This header format is compliant with the PE/COFF header and is largely inspired
+by the ARM64 header. Thus, the ARM64 and RISC-V headers can be combined into
+one common header in the future.
+
+Notes:
+- This header can also be reused to support an EFI stub for RISC-V in the
+ future. The EFI specification needs a PE/COFF image header at the beginning
+ of the kernel image in order to load it as an EFI application. In order to
+ support the EFI stub, code0 should be replaced with the "MZ" magic string and
+ res4 (at offset 0x3c) should point to the rest of the PE/COFF header.
+
+- The version field indicates the header version number.
+ Bits 0:15 - Minor version
+ Bits 16:31 - Major version
+
+ This preserves compatibility across newer and older versions of the header.
+ The current version is defined as 0.1.
+
+- res3 is reserved for an offset to any additional fields. This makes the
+ header extendable in the future. One example would be to accommodate ISA
+ extensions for RISC-V. For the current version, it is set to zero.
+
+- In the current header, the flags field has only one flag.
+ Bit 0: Kernel endianness. 1 if BE, 0 if LE.
+
+- Image size is mandatory for the boot loader to load the kernel image.
+ Booting will fail otherwise.
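
As a worked example of the layout above, a boot loader could validate the magic and version fields as sketched below; this is not part of the patch, and the field widths simply follow the listing above.

/* Sketch only: validate the RISC-V image header described above. */
#include <stdint.h>

struct riscv_image_header {
	uint32_t code0;		/* Executable code */
	uint32_t code1;		/* Executable code */
	uint64_t text_offset;	/* Image load offset, little endian */
	uint64_t image_size;	/* Effective image size, little endian */
	uint64_t flags;		/* Kernel flags, little endian */
	uint32_t version;	/* Major in bits 16:31, minor in bits 0:15 */
	uint32_t res1;
	uint64_t res2;
	uint64_t magic;		/* 0x5643534952 == "RISCV", little endian */
	uint32_t res3;
	uint32_t res4;		/* Reserved for PE COFF offset */
};

static int riscv_image_header_ok(const struct riscv_image_header *h)
{
	if (h->magic != 0x5643534952ULL)
		return 0;
	/* The current header version is 0.1 */
	return (h->version >> 16) == 0 && (h->version & 0xffff) == 1;
}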
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index 7d2b0178d3f3..fbb314bfa112 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -51,15 +51,17 @@ Synopsis of kprobe_events
$argN : Fetch the Nth function argument. (N >= 1) (\*1)
$retval : Fetch return value.(\*2)
$comm : Fetch current task comm.
- +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(\*3)
+ +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
- (x8/x16/x32/x64), "string" and bitfield are supported.
+ (x8/x16/x32/x64), "string", "ustring" and bitfield
+ are supported.
(\*1) only for the probe on function entry (offs == 0).
(\*2) only for return probe.
(\*3) this is useful for fetching a field of data structures.
+ (\*4) "u" means user-space dereference. See :ref:`user_mem_access`.
Types
-----
@@ -77,7 +79,8 @@ apply it to registers/stack-entries etc. (for example, '$stack1:x8[8]' is
wrong, but '+8($stack):x8[8]' is OK.)
String type is a special type, which fetches a "null-terminated" string from
kernel space. This means it will fail and store NULL if the string container
-has been paged out.
+has been paged out. The "ustring" type is an alternative to string for user-space.
+See :ref:`user_mem_access` for more info.
The string array type is a bit different from other types. For other base
types, <base-type>[1] is equal to <base-type> (e.g. +0(%di):x32[1] is same
as +0(%di):x32.) But string[1] is not equal to string. The string type itself
@@ -92,6 +95,25 @@ Symbol type('symbol') is an alias of u32 or u64 type (depends on BITS_PER_LONG)
which shows given pointer in "symbol+offset" style.
For $comm, the default type is "string"; any other type is invalid.
+.. _user_mem_access:
+User Memory Access
+------------------
+Kprobe events support user-space memory access. For that purpose, you can use
+either user-space dereference syntax or 'ustring' type.
+
+The user-space dereference syntax allows you to access a field of a data
+structure in user-space. This is done by adding the "u" prefix to the
+dereference syntax. For example, +u4(%si) means it will read memory from the
+address in the register %si offset by 4, and the memory is expected to be in
+user-space. You can use this for strings too, e.g. +u0(%si):string will read
+a string from the address in the register %si that is expected to be in user-
+space. 'ustring' is a shortcut way of performing the same task. That is,
++0(%si):ustring is equivalent to +u0(%si):string.
+
+Note that kprobe-event provides the user-memory access syntax but it doesn't
+use it transparently. This means if you use a normal dereference or string type
+for user memory, it might fail, and may always fail on some archs. The user
+has to carefully check if the target data is in kernel or user space.
Per-Probe Event Filtering
-------------------------
@@ -124,6 +146,20 @@ You can check the total number of probe hits and probe miss-hits via
The first column is event name, the second is the number of probe hits,
the third is the number of probe miss-hits.
+Kernel Boot Parameter
+---------------------
+You can add and enable new kprobe events when booting up the kernel with the
+"kprobe_event=" parameter. The parameter accepts a semicolon-delimited list of
+kprobe events, whose format is similar to that of the kprobe_events interface.
+The difference is that the probe definition parameters are comma-delimited
+instead of space-delimited. For example, a myprobe event on do_sys_open like below
+
+ p:myprobe do_sys_open dfd=%ax filename=%dx flags=%cx mode=+4($stack)
+
+should be written as below for the kernel boot parameter (just replace spaces with commas)
+
+ p:myprobe,do_sys_open,dfd=%ax,filename=%dx,flags=%cx,mode=+4($stack)
+
Usage examples
--------------
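
The runtime interface that the boot parameter mirrors can also be driven from a program. Below is a hedged sketch that appends the example probe above to the kprobe_events file; the tracefs mount point is an assumption and may differ on your system.

/*
 * Sketch only: register the do_sys_open probe shown above by writing
 * to tracefs. Assumes tracefs is mounted at /sys/kernel/debug/tracing.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *def =
		"p:myprobe do_sys_open dfd=%ax filename=%dx flags=%cx mode=+4($stack)\n";
	int fd = open("/sys/kernel/debug/tracing/kprobe_events",
		      O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("kprobe_events");
		return 1;
	}
	if (write(fd, def, strlen(def)) < 0)
		perror("write");
	close(fd);
	return 0;
}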
diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst
index 0b21305fabdc..6e75a6c5a2c8 100644
--- a/Documentation/trace/uprobetracer.rst
+++ b/Documentation/trace/uprobetracer.rst
@@ -42,16 +42,18 @@ Synopsis of uprobe_tracer
@+OFFSET : Fetch memory at OFFSET (OFFSET from same file as PATH)
$stackN : Fetch Nth entry of stack (N >= 0)
$stack : Fetch stack address.
- $retval : Fetch return value.(*)
+ $retval : Fetch return value.(\*1)
$comm : Fetch current task comm.
- +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
+ +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*2)(\*3)
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), "string" and bitfield are supported.
- (*) only for return probe.
- (**) this is useful for fetching a field of data structures.
+ (\*1) only for return probe.
+ (\*2) this is useful for fetching a field of data structures.
+ (\*3) Unlike kprobe events, the "u" prefix will just be ignored, because uprobe
+ events can access only user-space memory.
Types
-----
diff --git a/Documentation/watchdog/hpwdt.rst b/Documentation/watchdog/hpwdt.rst
index 94a96371113e..c165d92cfd12 100644
--- a/Documentation/watchdog/hpwdt.rst
+++ b/Documentation/watchdog/hpwdt.rst
@@ -39,6 +39,10 @@ Last reviewed: 08/20/2018
Default value is set when compiling the kernel. If it is set
to "Y", then there is no way of disabling the watchdog once
it has been started.
+ kdumptimeout Minimum timeout in seconds to apply upon receipt of an NMI
+ before calling panic. (-1) disables the watchdog. When the value
+ is > 0, the timer is reprogrammed with the greater of the
+ value or the current timeout value.
============ ================================================================
NOTE:
diff --git a/Documentation/watchdog/watchdog-parameters.rst b/Documentation/watchdog/watchdog-parameters.rst
index b121caae7798..a3985cc5aeda 100644
--- a/Documentation/watchdog/watchdog-parameters.rst
+++ b/Documentation/watchdog/watchdog-parameters.rst
@@ -13,6 +13,17 @@ modules.
-------------------------------------------------
+watchdog core:
+ open_timeout:
+ Maximum time, in seconds, for which the watchdog framework will take
+ care of pinging a running hardware watchdog until userspace opens the
+ corresponding /dev/watchdogN device. A value of 0 means an infinite
+ timeout. Setting this to a non-zero value can be useful to ensure that
+ either userspace comes up properly, or the board gets reset and allows
+ fallback logic in the bootloader to try something else.
+
+-------------------------------------------------
+
acquirewdt:
wdt_stop:
Acquire WDT 'stop' io port (default 0x43)
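
The open_timeout description above implies a userspace counterpart: once a program opens /dev/watchdogN, the framework stops pinging the hardware on its behalf and the program must do so itself. A hedged sketch of such a daemon follows; the device node name is illustrative.

/*
 * Sketch only: minimal watchdog daemon matching the open_timeout
 * semantics described above. /dev/watchdog0 is a placeholder node.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/watchdog.h>

int main(void)
{
	int fd = open("/dev/watchdog0", O_WRONLY);

	if (fd < 0) {
		perror("/dev/watchdog0");
		return 1;
	}
	for (;;) {
		ioctl(fd, WDIOC_KEEPALIVE, 0);	/* ping the hardware */
		sleep(10);
	}
}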
diff --git a/MAINTAINERS b/MAINTAINERS
index ce0687771073..500cdb68ccbc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3765,7 +3765,7 @@ F: arch/powerpc/platforms/cell/
CEPH COMMON CODE (LIBCEPH)
M: Ilya Dryomov <idryomov@gmail.com>
-M: "Yan, Zheng" <zyan@redhat.com>
+M: Jeff Layton <jlayton@kernel.org>
M: Sage Weil <sage@redhat.com>
L: ceph-devel@vger.kernel.org
W: http://ceph.com/
@@ -3777,7 +3777,7 @@ F: include/linux/ceph/
F: include/linux/crush/
CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH)
-M: "Yan, Zheng" <zyan@redhat.com>
+M: Jeff Layton <jlayton@kernel.org>
M: Sage Weil <sage@redhat.com>
M: Ilya Dryomov <idryomov@gmail.com>
L: ceph-devel@vger.kernel.org
@@ -6321,9 +6321,8 @@ F: Documentation/devicetree/bindings/counter/ftm-quaddec.txt
F: drivers/counter/ftm-quaddec.c
FLOPPY DRIVER
-M: Jiri Kosina <jikos@kernel.org>
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/floppy.git
-S: Odd fixes
+S: Orphan
+L: linux-block@vger.kernel.org
F: drivers/block/floppy.c
FMC SUBSYSTEM
@@ -12841,6 +12840,7 @@ F: drivers/base/power/
F: include/linux/pm.h
F: include/linux/pm_*
F: include/linux/powercap.h
+F: include/linux/intel_rapl.h
F: drivers/powercap/
F: kernel/configs/nopm.config
@@ -13720,7 +13720,7 @@ RISC-V ARCHITECTURE
M: Palmer Dabbelt <palmer@sifive.com>
M: Albert Ou <aou@eecs.berkeley.edu>
L: linux-riscv@lists.infradead.org
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/palmer/riscv-linux.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git
S: Supported
F: arch/riscv/
K: riscv
@@ -14582,7 +14582,7 @@ M: Paul Walmsley <paul.walmsley@sifive.com>
L: linux-riscv@lists.infradead.org
T: git git://github.com/sifive/riscv-linux.git
S: Supported
-K: sifive
+K: [^@]sifive
N: sifive
SIFIVE FU540 SYSTEM-ON-CHIP
@@ -17651,9 +17651,8 @@ L: linux-xfs@vger.kernel.org
W: http://xfs.org/
T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git
S: Supported
-F: Documentation/filesystems/xfs.txt
+F: Documentation/admin-guide/xfs.rst
F: Documentation/ABI/testing/sysfs-fs-xfs
-F: Documentation/filesystems/xfs.txt
F: Documentation/filesystems/xfs-delayed-logging-design.txt
F: Documentation/filesystems/xfs-self-describing-metadata.txt
F: fs/xfs/
diff --git a/arch/Kconfig b/arch/Kconfig
index e8d19c3cb91f..ac0fba400ded 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -128,22 +128,6 @@ config UPROBES
managed by the kernel and kept transparent to the probed
application. )
-config HAVE_64BIT_ALIGNED_ACCESS
- def_bool 64BIT && !HAVE_EFFICIENT_UNALIGNED_ACCESS
- help
- Some architectures require 64 bit accesses to be 64 bit
- aligned, which also requires structs containing 64 bit values
- to be 64 bit aligned too. This includes some 32 bit
- architectures which can do 64 bit accesses, as well as 64 bit
- architectures without unaligned access.
-
- This symbol should be selected by an architecture if 64 bit
- accesses are required to be 64 bit aligned in this way even
- though it is not a 64 bit architecture.
-
- See Documentation/unaligned-memory-access.txt for more
- information on the topic of unaligned memory accesses.
-
config HAVE_EFFICIENT_UNALIGNED_ACCESS
bool
help
@@ -585,6 +569,9 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
config HAVE_ARCH_HUGE_VMAP
bool
+config ARCH_WANT_HUGE_PMD_SHARE
+ bool
+
config HAVE_ARCH_SOFT_DIRTY
bool
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index b3d439c41c7b..deef17f34bd2 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -55,6 +55,13 @@ void *module_alloc(unsigned long size)
}
#endif
+bool module_exit_section(const char *name)
+{
+ return strstarts(name, ".exit") ||
+ strstarts(name, ".ARM.extab.exit") ||
+ strstarts(name, ".ARM.exidx.exit");
+}
+
int
apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
unsigned int relindex, struct module *module)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e1ea69994e0f..3adcec05b1f6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -73,6 +73,7 @@ config ARM64
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
select ARCH_WANT_FRAME_POINTERS
+ select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARM_AMBA
select ARM_ARCH_TIMER
@@ -906,7 +907,6 @@ config SYS_SUPPORTS_HUGETLBFS
def_bool y
config ARCH_WANT_HUGE_PMD_SHARE
- def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
config ARCH_HAS_CACHE_LINE_SIZE
def_bool y
diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h
index b0838dc4dfee..cd438e4150f6 100644
--- a/arch/parisc/include/asm/unistd.h
+++ b/arch/parisc/include/asm/unistd.h
@@ -166,6 +166,7 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \
#define __ARCH_WANT_SYS_FORK
#define __ARCH_WANT_SYS_VFORK
#define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_SYS_CLONE3
#define __ARCH_WANT_COMPAT_SYS_SENDFILE
#ifdef CONFIG_64BIT
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index 3e430590c1e1..d9d3387f7c47 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -1732,6 +1732,7 @@ ENDPROC_CFI(sys_\name\()_wrapper)
.endm
fork_like clone
+fork_like clone3
fork_like fork
fork_like vfork
diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c
index d58960b33bda..5d7f2692ac5a 100644
--- a/arch/parisc/kernel/kprobes.c
+++ b/arch/parisc/kernel/kprobes.c
@@ -133,6 +133,9 @@ int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs)
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
struct kprobe *p = kprobe_running();
+ if (!p)
+ return 0;
+
if (regs->iaoq[0] != (unsigned long)p->ainsn.insn+4)
return 0;
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index f642ba378ffa..9f6ff7bc06f9 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -167,6 +167,9 @@ long arch_ptrace(struct task_struct *child, long request,
if ((addr & (sizeof(unsigned long)-1)) ||
addr >= sizeof(struct pt_regs))
break;
+ if (addr == PT_IAOQ0 || addr == PT_IAOQ1) {
+ data |= 3; /* ensure userspace privilege */
+ }
if ((addr >= PT_GR1 && addr <= PT_GR31) ||
addr == PT_IAOQ0 || addr == PT_IAOQ1 ||
(addr >= PT_FR0 && addr <= PT_FR31 + 4) ||
@@ -228,16 +231,18 @@ long arch_ptrace(struct task_struct *child, long request,
static compat_ulong_t translate_usr_offset(compat_ulong_t offset)
{
- if (offset < 0)
- return sizeof(struct pt_regs);
- else if (offset <= 32*4) /* gr[0..31] */
- return offset * 2 + 4;
- else if (offset <= 32*4+32*8) /* gr[0..31] + fr[0..31] */
- return offset + 32*4;
- else if (offset < sizeof(struct pt_regs)/2 + 32*4)
- return offset * 2 + 4 - 32*8;
+ compat_ulong_t pos;
+
+ if (offset < 32*4) /* gr[0..31] */
+ pos = offset * 2 + 4;
+ else if (offset < 32*4+32*8) /* fr[0] ... fr[31] */
+ pos = (offset - 32*4) + PT_FR0;
+ else if (offset < sizeof(struct pt_regs)/2 + 32*4) /* sr[0] ... ipsw */
+ pos = (offset - 32*4 - 32*8) * 2 + PT_SR0 + 4;
else
- return sizeof(struct pt_regs);
+ pos = sizeof(struct pt_regs);
+
+ return pos;
}
long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
@@ -281,9 +286,12 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
addr = translate_usr_offset(addr);
if (addr >= sizeof(struct pt_regs))
break;
+ if (addr == PT_IAOQ0+4 || addr == PT_IAOQ1+4) {
+ data |= 3; /* ensure userspace privilege */
+ }
if (addr >= PT_FR0 && addr <= PT_FR31 + 4) {
/* Special case, fp regs are 64 bits anyway */
- *(__u64 *) ((char *) task_regs(child) + addr) = data;
+ *(__u32 *) ((char *) task_regs(child) + addr) = data;
ret = 0;
}
else if ((addr >= PT_GR1+4 && addr <= PT_GR31+4) ||
@@ -496,7 +504,8 @@ static void set_reg(struct pt_regs *regs, int num, unsigned long val)
return;
case RI(iaoq[0]):
case RI(iaoq[1]):
- regs->iaoq[num - RI(iaoq[0])] = val;
+ /* set 2 lowest bits to ensure userspace privilege: */
+ regs->iaoq[num - RI(iaoq[0])] = val | 3;
return;
case RI(sar): regs->sar = val;
return;
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index c7aadfef5386..670d1371aca1 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -431,4 +431,4 @@
432 common fsmount sys_fsmount
433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
-# 435 reserved for clone3
+435 common clone3 sys_clone3_wrapper
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 13a1c0d04e9e..59a4727ecd6c 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -52,6 +52,8 @@ config RISCV
select ARCH_HAS_MMIOWB
select HAVE_EBPF_JIT if 64BIT
select EDAC_SUPPORT
+ select ARCH_HAS_GIGANTIC_PAGE
+ select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
config MMU
def_bool y
@@ -66,6 +68,12 @@ config PAGE_OFFSET
default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB
+config ARCH_WANT_GENERAL_HUGETLB
+ def_bool y
+
+config SYS_SUPPORTS_HUGETLBFS
+ def_bool y
+
config STACKTRACE_SUPPORT
def_bool y
@@ -97,6 +105,8 @@ config PGTABLE_LEVELS
default 3 if 64BIT
default 2
+source "arch/riscv/Kconfig.socs"
+
menu "Platform type"
choice
diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs
new file mode 100644
index 000000000000..536c0ef4aee8
--- /dev/null
+++ b/arch/riscv/Kconfig.socs
@@ -0,0 +1,13 @@
+menu "SoC selection"
+
+config SOC_SIFIVE
+ bool "SiFive SoCs"
+ select SERIAL_SIFIVE
+ select SERIAL_SIFIVE_CONSOLE
+ select CLK_SIFIVE
+ select CLK_SIFIVE_FU540_PRCI
+ select SIFIVE_PLIC
+ help
+ This enables support for SiFive SoC platform hardware.
+
+endmenu
diff --git a/arch/riscv/boot/dts/sifive/Makefile b/arch/riscv/boot/dts/sifive/Makefile
index baaeef9efdcb..6d6189e6e4af 100644
--- a/arch/riscv/boot/dts/sifive/Makefile
+++ b/arch/riscv/boot/dts/sifive/Makefile
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0
-dtb-y += hifive-unleashed-a00.dtb
+dtb-$(CONFIG_SOC_SIFIVE) += hifive-unleashed-a00.dtb
diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index 04944fb4fa7a..b7b749b18853 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -1,5 +1,7 @@
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_HIGH_RES_TIMERS=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_CGROUPS=y
@@ -12,6 +14,7 @@ CONFIG_CHECKPOINT_RESTORE=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_EXPERT=y
CONFIG_BPF_SYSCALL=y
+CONFIG_SOC_SIFIVE=y
CONFIG_SMP=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
@@ -49,8 +52,6 @@ CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
CONFIG_SERIAL_OF_PLATFORM=y
CONFIG_SERIAL_EARLYCON_RISCV_SBI=y
-CONFIG_SERIAL_SIFIVE=y
-CONFIG_SERIAL_SIFIVE_CONSOLE=y
CONFIG_HVC_RISCV_SBI=y
# CONFIG_PTP_1588_CLOCK is not set
CONFIG_DRM=y
@@ -66,9 +67,6 @@ CONFIG_USB_OHCI_HCD_PLATFORM=y
CONFIG_USB_STORAGE=y
CONFIG_USB_UAS=y
CONFIG_VIRTIO_MMIO=y
-CONFIG_CLK_SIFIVE=y
-CONFIG_CLK_SIFIVE_FU540_PRCI=y
-CONFIG_SIFIVE_PLIC=y
CONFIG_SPI_SIFIVE=y
CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig
index 1a911ed8e772..d5449ef805a3 100644
--- a/arch/riscv/configs/rv32_defconfig
+++ b/arch/riscv/configs/rv32_defconfig
@@ -1,5 +1,7 @@
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_HIGH_RES_TIMERS=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_CGROUPS=y
diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
index ad8678f1b54a..555b20b11dc3 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -6,11 +6,66 @@
#ifndef _ASM_RISCV_CACHEFLUSH_H
#define _ASM_RISCV_CACHEFLUSH_H
-#include <asm-generic/cacheflush.h>
+#include <linux/mm.h>
-#undef flush_icache_range
-#undef flush_icache_user_range
-#undef flush_dcache_page
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
+
+/*
+ * The cache doesn't need to be flushed when TLB entries change because
+ * the cache is mapped to physical memory, not virtual memory
+ */
+static inline void flush_cache_all(void)
+{
+}
+
+static inline void flush_cache_mm(struct mm_struct *mm)
+{
+}
+
+static inline void flush_cache_dup_mm(struct mm_struct *mm)
+{
+}
+
+static inline void flush_cache_range(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
+{
+}
+
+static inline void flush_cache_page(struct vm_area_struct *vma,
+ unsigned long vmaddr,
+ unsigned long pfn)
+{
+}
+
+static inline void flush_dcache_mmap_lock(struct address_space *mapping)
+{
+}
+
+static inline void flush_dcache_mmap_unlock(struct address_space *mapping)
+{
+}
+
+static inline void flush_icache_page(struct vm_area_struct *vma,
+ struct page *page)
+{
+}
+
+static inline void flush_cache_vmap(unsigned long start, unsigned long end)
+{
+}
+
+static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
+{
+}
+
+#define copy_to_user_page(vma, page, vaddr, dst, src, len) \
+ do { \
+ memcpy(dst, src, len); \
+ flush_icache_user_range(vma, page, vaddr, len); \
+ } while (0)
+#define copy_from_user_page(vma, page, vaddr, dst, src, len) \
+ memcpy(dst, src, len)
static inline void local_flush_icache_all(void)
{
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index c207f6634b91..9c66033c3a54 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -21,6 +21,11 @@
*/
enum fixed_addresses {
FIX_HOLE,
+#define FIX_FDT_SIZE SZ_1M
+ FIX_FDT_END,
+ FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
+ FIX_PTE,
+ FIX_PMD,
FIX_EARLYCON_MEM_BASE,
__end_of_fixed_addresses
};
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
new file mode 100644
index 000000000000..728a5db66597
--- /dev/null
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RISCV_HUGETLB_H
+#define _ASM_RISCV_HUGETLB_H
+
+#include <asm-generic/hugetlb.h>
+#include <asm/page.h>
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+ unsigned long addr,
+ unsigned long len)
+{
+ return 0;
+}
+
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+}
+
+#endif /* _ASM_RISCV_HUGETLB_H */
diff --git a/arch/riscv/include/asm/image.h b/arch/riscv/include/asm/image.h
new file mode 100644
index 000000000000..ef28e106f247
--- /dev/null
+++ b/arch/riscv/include/asm/image.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_IMAGE_H
+#define __ASM_IMAGE_H
+
+#define RISCV_IMAGE_MAGIC "RISCV"
+
+#define RISCV_IMAGE_FLAG_BE_SHIFT 0
+#define RISCV_IMAGE_FLAG_BE_MASK 0x1
+
+#define RISCV_IMAGE_FLAG_LE 0
+#define RISCV_IMAGE_FLAG_BE 1
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#error conversion of header fields to LE not yet implemented
+#else
+#define __HEAD_FLAG_BE RISCV_IMAGE_FLAG_LE
+#endif
+
+#define __HEAD_FLAG(field) (__HEAD_FLAG_##field << \
+ RISCV_IMAGE_FLAG_##field##_SHIFT)
+
+#define __HEAD_FLAGS (__HEAD_FLAG(BE))
+
+#define RISCV_HEADER_VERSION_MAJOR 0
+#define RISCV_HEADER_VERSION_MINOR 1
+
+#define RISCV_HEADER_VERSION (RISCV_HEADER_VERSION_MAJOR << 16 | \
+ RISCV_HEADER_VERSION_MINOR)
+
+#ifndef __ASSEMBLY__
+/**
+ * struct riscv_image_header - riscv kernel image header
+ * @code0: Executable code
+ * @code1: Executable code
+ * @text_offset: Image load offset (little endian)
+ * @image_size: Effective Image size (little endian)
+ * @flags: kernel flags (little endian)
+ * @version: version
+ * @res1: reserved
+ * @res2: reserved
+ * @magic: Magic number
+ * @res3: reserved (will be used for additional RISC-V specific
+ * header)
+ * @res4: reserved (will be used for PE COFF offset)
+ *
+ * The intention is for this header format to be shared between multiple
+ * architectures to avoid a proliferation of image header formats.
+ */
+
+struct riscv_image_header {
+ u32 code0;
+ u32 code1;
+ u64 text_offset;
+ u64 image_size;
+ u64 flags;
+ u32 version;
+ u32 res1;
+ u64 res2;
+ u64 magic;
+ u32 res3;
+ u32 res4;
+};
+#endif /* __ASSEMBLY__ */
+#endif /* __ASM_IMAGE_H */
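A boot loader consumes this header by reading the first 64 bytes of the image and checking the magic before honoring text_offset and image_size. A minimal loader-side sketch (check_header() is hypothetical and assumes a little-endian host, matching the little-endian fields documented above):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Loader-side mirror of struct riscv_image_header. */
struct riscv_image_header {
        uint32_t code0, code1;
        uint64_t text_offset, image_size, flags;
        uint32_t version, res1;
        uint64_t res2, magic;
        uint32_t res3, res4;
};

static int check_header(const struct riscv_image_header *h)
{
        if (memcmp(&h->magic, "RISCV", 5) != 0)
                return -1;                     /* not a RISC-V image */
        /* RISCV_HEADER_VERSION: major in bits 31:16, minor in 15:0 */
        printf("v%u.%u, text_offset=0x%llx, image_size=0x%llx\n",
               h->version >> 16, h->version & 0xffff,
               (unsigned long long)h->text_offset,
               (unsigned long long)h->image_size);
        return 0;
}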
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 8ddb6c7fedac..707e00a8430b 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -16,6 +16,16 @@
#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE - 1))
+#ifdef CONFIG_64BIT
+#define HUGE_MAX_HSTATE 2
+#else
+#define HUGE_MAX_HSTATE 1
+#endif
+#define HPAGE_SHIFT PMD_SHIFT
+#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+
/*
* PAGE_OFFSET -- the first address of the first page of memory.
* When not using MMU this corresponds to the first free page in
@@ -115,8 +125,4 @@ extern unsigned long min_low_pfn;
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
-/* vDSO support */
-/* We do define AT_SYSINFO_EHDR but don't use the gate mechanism */
-#define __HAVE_ARCH_GATE_AREA
-
#endif /* _ASM_RISCV_PAGE_H */
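With these macros the huge-page geometry falls straight out of the page-table layout: HPAGE_SIZE is one PMD entry and, on RV64, the second hstate is one PUD entry. A sketch of the derived numbers, assuming Sv39 (PAGE_SHIFT = 12, PMD_SHIFT = 21, PUD_SHIFT = 30):

#include <stdio.h>

int main(void)
{
        const unsigned page_shift = 12, pmd_shift = 21, pud_shift = 30;

        /* HPAGE_SIZE and HUGETLB_PAGE_ORDER as defined above */
        printf("HPAGE_SIZE = %lu MiB, order %u\n",
               (1UL << pmd_shift) >> 20, pmd_shift - page_shift);
        /* the second RV64 hstate: PUD-sized gigantic pages */
        printf("PUD huge page = %lu GiB\n", (1UL << pud_shift) >> 30);
        return 0;
}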
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 45dfac2ac51f..74630989006d 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -70,6 +70,11 @@ static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
}
+static inline unsigned long _pmd_pfn(pmd_t pmd)
+{
+ return pmd_val(pmd) >> _PAGE_PFN_SHIFT;
+}
+
#define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index f7c3f7de15f2..a364aba23d55 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -59,6 +59,8 @@
#define PAGE_KERNEL __pgprot(_PAGE_KERNEL)
#define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL | _PAGE_EXEC)
+#define PAGE_TABLE __pgprot(_PAGE_TABLE)
+
extern pgd_t swapper_pg_dir[];
/* MAP_PRIVATE permissions: xwr (copy-on-write) */
@@ -113,12 +115,16 @@ static inline void pmd_clear(pmd_t *pmdp)
set_pmd(pmdp, __pmd(0));
}
-
static inline pgd_t pfn_pgd(unsigned long pfn, pgprot_t prot)
{
return __pgd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
}
+static inline unsigned long _pgd_pfn(pgd_t pgd)
+{
+ return pgd_val(pgd) >> _PAGE_PFN_SHIFT;
+}
+
#define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
/* Locate an entry in the page global directory */
@@ -250,6 +256,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
return __pte(pte_val(pte) | _PAGE_SPECIAL);
}
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+ return pte;
+}
+
/* Modify page protection bits */
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
@@ -396,6 +407,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#define kern_addr_valid(addr) (1) /* FIXME */
#endif
+extern void *dtb_early_va;
extern void setup_bootmem(void);
extern void paging_init(void);
@@ -409,7 +421,7 @@ static inline void pgtable_cache_init(void)
#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
/*
- * Task size is 0x40000000000 for RV64 or 0xb800000 for RV32.
+ * Task size is 0x4000000000 for RV64 or 0xb800000 for RV32.
* Note that PGDIR_SIZE must evenly divide TASK_SIZE.
*/
#ifdef CONFIG_64BIT
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 4e46f31072da..0f1ba17e476f 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -11,9 +11,41 @@
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/csr.h>
+#include <asm/image.h>
__INIT
ENTRY(_start)
+ /*
+ * Image header expected by Linux boot-loaders. The image header data
+ * structure is described in asm/image.h.
+ * Do not modify it without modifying the structure and all bootloaders
+ * that expect this header format!
+ */
+ /* jump to start kernel */
+ j _start_kernel
+ /* reserved */
+ .word 0
+ .balign 8
+#if __riscv_xlen == 64
+ /* Image load offset (2MB) from start of RAM */
+ .dword 0x200000
+#else
+ /* Image load offset (4MB) from start of RAM */
+ .dword 0x400000
+#endif
+ /* Effective size of kernel image */
+ .dword _end - _start
+ .dword __HEAD_FLAGS
+ .word RISCV_HEADER_VERSION
+ .word 0
+ .dword 0
+ .asciz RISCV_IMAGE_MAGIC
+ .word 0
+ .balign 4
+ .word 0
+
+.global _start_kernel
+_start_kernel:
/* Mask all interrupts */
csrw CSR_SIE, zero
csrw CSR_SIP, zero
@@ -55,7 +87,9 @@ clear_bss_done:
/* Initialize page tables and relocate to virtual addresses */
la sp, init_thread_union + THREAD_SIZE
+ mv a0, s1
call setup_vm
+ la a0, early_pg_dir
call relocate
/* Restore C environment */
@@ -64,25 +98,23 @@ clear_bss_done:
la sp, init_thread_union + THREAD_SIZE
/* Start the kernel */
- mv a0, s1
call parse_dtb
tail start_kernel
relocate:
/* Relocate return address */
li a1, PAGE_OFFSET
- la a0, _start
- sub a1, a1, a0
+ la a2, _start
+ sub a1, a1, a2
add ra, ra, a1
/* Point stvec to virtual address of instruction after satp write */
- la a0, 1f
- add a0, a0, a1
- csrw CSR_STVEC, a0
+ la a2, 1f
+ add a2, a2, a1
+ csrw CSR_STVEC, a2
/* Compute satp for kernel page tables, but don't load it yet */
- la a2, swapper_pg_dir
- srl a2, a2, PAGE_SHIFT
+ srl a2, a0, PAGE_SHIFT
li a1, SATP_MODE
or a2, a2, a1
@@ -148,6 +180,7 @@ relocate:
fence
/* Enable virtual memory and relocate to virtual address */
+ la a0, swapper_pg_dir
call relocate
tail smp_callin
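The directives emitted at _start must stay byte-for-byte compatible with struct riscv_image_header. A compile-time sketch of that correspondence (assuming LP64 and natural struct alignment, which the kernel relies on as well):

#include <stddef.h>
#include <stdint.h>

struct riscv_image_header {
        uint32_t code0, code1;
        uint64_t text_offset, image_size, flags;
        uint32_t version, res1;
        uint64_t res2, magic;
        uint32_t res3, res4;
};

/* j + .word 0 + .balign 8 put the load offset at byte 8 */
_Static_assert(offsetof(struct riscv_image_header, text_offset) == 8, "");
/* the .asciz magic lands in the u64 magic field at byte 48 */
_Static_assert(offsetof(struct riscv_image_header, magic) == 48, "");
_Static_assert(sizeof(struct riscv_image_header) == 64, "64-byte header");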
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index b92e6831d1ec..a990a6cb184f 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -39,11 +39,9 @@ struct screen_info screen_info = {
atomic_t hart_lottery;
unsigned long boot_cpu_hartid;
-void __init parse_dtb(phys_addr_t dtb_phys)
+void __init parse_dtb(void)
{
- void *dtb = __va(dtb_phys);
-
- if (early_init_dt_scan(dtb))
+ if (early_init_dt_scan(dtb_early_va))
return;
pr_err("No DTB passed to the kernel\n");
diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
index a0084c36d270..c9c21e0d5641 100644
--- a/arch/riscv/kernel/vdso.c
+++ b/arch/riscv/kernel/vdso.c
@@ -92,22 +92,3 @@ const char *arch_vma_name(struct vm_area_struct *vma)
return "[vdso]";
return NULL;
}
-
-/*
- * Function stubs to prevent linker errors when AT_SYSINFO_EHDR is defined
- */
-
-int in_gate_area_no_mm(unsigned long addr)
-{
- return 0;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
-}
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
- return NULL;
-}
diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile
index fc51d3b7876e..74055e1d6f21 100644
--- a/arch/riscv/mm/Makefile
+++ b/arch/riscv/mm/Makefile
@@ -12,3 +12,5 @@ obj-y += ioremap.o
obj-y += cacheflush.o
obj-y += context.o
obj-y += sifive_l2_cache.o
+
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
new file mode 100644
index 000000000000..0d4747e9d5b5
--- /dev/null
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/hugetlb.h>
+#include <linux/err.h>
+
+int pud_huge(pud_t pud)
+{
+ return pud_present(pud) &&
+ (pud_val(pud) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC));
+}
+
+int pmd_huge(pmd_t pmd)
+{
+ return pmd_present(pmd) &&
+ (pmd_val(pmd) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC));
+}
+
+static __init int setup_hugepagesz(char *opt)
+{
+ unsigned long ps = memparse(opt, &opt);
+
+ if (ps == HPAGE_SIZE) {
+ hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
+ } else if (IS_ENABLED(CONFIG_64BIT) && ps == PUD_SIZE) {
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else {
+ hugetlb_bad_size();
+ pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20);
+ return 0;
+ }
+
+ return 1;
+}
+__setup("hugepagesz=", setup_hugepagesz);
+
+#ifdef CONFIG_CONTIG_ALLOC
+static __init int gigantic_pages_init(void)
+{
+ /* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */
+ if (IS_ENABLED(CONFIG_64BIT) && !size_to_hstate(1UL << PUD_SHIFT))
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ return 0;
+}
+arch_initcall(gigantic_pages_init);
+#endif
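setup_hugepagesz() accepts only the PMD-sized huge page (and, on RV64, the PUD-sized gigantic page) from the hugepagesz= boot parameter. A userspace sketch of that parsing and policy; my_memparse() is a simplified stand-in for the kernel's memparse(), and the shift values assume Sv39:

#include <stdio.h>
#include <stdlib.h>

static unsigned long my_memparse(const char *s)
{
        char *end;
        unsigned long v = strtoul(s, &end, 0);

        switch (*end) {                /* K/M/G suffixes, as memparse() */
        case 'G': case 'g': v <<= 10;  /* fall through */
        case 'M': case 'm': v <<= 10;  /* fall through */
        case 'K': case 'k': v <<= 10;
        }
        return v;
}

int main(void)
{
        const unsigned long HPAGE_SIZE = 1UL << 21;  /* PMD_SHIFT = 21 */
        const unsigned long PUD_SIZE = 1UL << 30;    /* PUD_SHIFT = 30 */
        const char *opts[] = { "2M", "1G", "64K" };

        for (int i = 0; i < 3; i++) {
                unsigned long ps = my_memparse(opts[i]);

                printf("hugepagesz=%s: %s\n", opts[i],
                       (ps == HPAGE_SIZE || ps == PUD_SIZE) ?
                       "supported" : "rejected");
        }
        return 0;
}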
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 84747d7a1e85..42bf939693d3 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2012 Regents of the University of California
+ * Copyright (C) 2019 Western Digital Corporation or its affiliates.
*/
#include <linux/init.h>
@@ -21,6 +22,8 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
__page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);
+extern char _start[];
+
static void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0, };
@@ -39,13 +42,6 @@ void setup_zero_page(void)
memset((void *)empty_zero_page, 0, PAGE_SIZE);
}
-void __init paging_init(void)
-{
- setup_zero_page();
- local_flush_tlb_all();
- zone_sizes_init();
-}
-
void __init mem_init(void)
{
#ifdef CONFIG_FLATMEM
@@ -84,29 +80,20 @@ disable:
initrd_start = 0;
initrd_end = 0;
}
-
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
#endif /* CONFIG_BLK_DEV_INITRD */
void __init setup_bootmem(void)
{
struct memblock_region *reg;
phys_addr_t mem_size = 0;
+ phys_addr_t vmlinux_end = __pa(&_end);
+ phys_addr_t vmlinux_start = __pa(&_start);
/* Find the memory region containing the kernel */
for_each_memblock(memory, reg) {
- phys_addr_t vmlinux_end = __pa(_end);
phys_addr_t end = reg->base + reg->size;
if (reg->base <= vmlinux_end && vmlinux_end <= end) {
- /*
- * Reserve from the start of the region to the end of
- * the kernel
- */
- memblock_reserve(reg->base, vmlinux_end - reg->base);
mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET);
/*
@@ -120,6 +107,9 @@ void __init setup_bootmem(void)
}
BUG_ON(mem_size == 0);
+ /* Reserve from the start of the kernel to the end of the kernel */
+ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
+
set_max_mapnr(PFN_DOWN(mem_size));
max_low_pfn = PFN_DOWN(memblock_end_of_DRAM());
@@ -147,17 +137,15 @@ EXPORT_SYMBOL(va_pa_offset);
unsigned long pfn_base;
EXPORT_SYMBOL(pfn_base);
+void *dtb_early_va;
pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
-pgd_t trampoline_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
+pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
+pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
+static bool mmu_enabled;
-#ifndef __PAGETABLE_PMD_FOLDED
-#define NUM_SWAPPER_PMDS ((uintptr_t)-PAGE_OFFSET >> PGDIR_SHIFT)
-pmd_t swapper_pmd[PTRS_PER_PMD*((-PAGE_OFFSET)/PGDIR_SIZE)] __page_aligned_bss;
-pmd_t trampoline_pmd[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
-pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
-#endif
+#define MAX_EARLY_MAPPING_SIZE SZ_128M
-pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
+pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
{
@@ -176,6 +164,156 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
}
}
+static pte_t *__init get_pte_virt(phys_addr_t pa)
+{
+ if (mmu_enabled) {
+ clear_fixmap(FIX_PTE);
+ return (pte_t *)set_fixmap_offset(FIX_PTE, pa);
+ } else {
+ return (pte_t *)((uintptr_t)pa);
+ }
+}
+
+static phys_addr_t __init alloc_pte(uintptr_t va)
+{
+ /*
+ * We only create PMD or PGD early mappings so we
+ * should never reach here with MMU disabled.
+ */
+ BUG_ON(!mmu_enabled);
+
+ return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static void __init create_pte_mapping(pte_t *ptep,
+ uintptr_t va, phys_addr_t pa,
+ phys_addr_t sz, pgprot_t prot)
+{
+ uintptr_t pte_index = pte_index(va);
+
+ BUG_ON(sz != PAGE_SIZE);
+
+ if (pte_none(ptep[pte_index]))
+ ptep[pte_index] = pfn_pte(PFN_DOWN(pa), prot);
+}
+
+#ifndef __PAGETABLE_PMD_FOLDED
+
+pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
+
+#if MAX_EARLY_MAPPING_SIZE < PGDIR_SIZE
+#define NUM_EARLY_PMDS 1UL
+#else
+#define NUM_EARLY_PMDS (1UL + MAX_EARLY_MAPPING_SIZE / PGDIR_SIZE)
+#endif
+pmd_t early_pmd[PTRS_PER_PMD * NUM_EARLY_PMDS] __initdata __aligned(PAGE_SIZE);
+
+static pmd_t *__init get_pmd_virt(phys_addr_t pa)
+{
+ if (mmu_enabled) {
+ clear_fixmap(FIX_PMD);
+ return (pmd_t *)set_fixmap_offset(FIX_PMD, pa);
+ } else {
+ return (pmd_t *)((uintptr_t)pa);
+ }
+}
+
+static phys_addr_t __init alloc_pmd(uintptr_t va)
+{
+ uintptr_t pmd_num;
+
+ if (mmu_enabled)
+ return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
+
+ pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT;
+ BUG_ON(pmd_num >= NUM_EARLY_PMDS);
+ return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD];
+}
+
+static void __init create_pmd_mapping(pmd_t *pmdp,
+ uintptr_t va, phys_addr_t pa,
+ phys_addr_t sz, pgprot_t prot)
+{
+ pte_t *ptep;
+ phys_addr_t pte_phys;
+ uintptr_t pmd_index = pmd_index(va);
+
+ if (sz == PMD_SIZE) {
+ if (pmd_none(pmdp[pmd_index]))
+ pmdp[pmd_index] = pfn_pmd(PFN_DOWN(pa), prot);
+ return;
+ }
+
+ if (pmd_none(pmdp[pmd_index])) {
+ pte_phys = alloc_pte(va);
+ pmdp[pmd_index] = pfn_pmd(PFN_DOWN(pte_phys), PAGE_TABLE);
+ ptep = get_pte_virt(pte_phys);
+ memset(ptep, 0, PAGE_SIZE);
+ } else {
+ pte_phys = PFN_PHYS(_pmd_pfn(pmdp[pmd_index]));
+ ptep = get_pte_virt(pte_phys);
+ }
+
+ create_pte_mapping(ptep, va, pa, sz, prot);
+}
+
+#define pgd_next_t pmd_t
+#define alloc_pgd_next(__va) alloc_pmd(__va)
+#define get_pgd_next_virt(__pa) get_pmd_virt(__pa)
+#define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
+ create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
+#define PTE_PARENT_SIZE PMD_SIZE
+#define fixmap_pgd_next fixmap_pmd
+#else
+#define pgd_next_t pte_t
+#define alloc_pgd_next(__va) alloc_pte(__va)
+#define get_pgd_next_virt(__pa) get_pte_virt(__pa)
+#define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
+ create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
+#define PTE_PARENT_SIZE PGDIR_SIZE
+#define fixmap_pgd_next fixmap_pte
+#endif
+
+static void __init create_pgd_mapping(pgd_t *pgdp,
+ uintptr_t va, phys_addr_t pa,
+ phys_addr_t sz, pgprot_t prot)
+{
+ pgd_next_t *nextp;
+ phys_addr_t next_phys;
+ uintptr_t pgd_index = pgd_index(va);
+
+ if (sz == PGDIR_SIZE) {
+ if (pgd_val(pgdp[pgd_index]) == 0)
+ pgdp[pgd_index] = pfn_pgd(PFN_DOWN(pa), prot);
+ return;
+ }
+
+ if (pgd_val(pgdp[pgd_index]) == 0) {
+ next_phys = alloc_pgd_next(va);
+ pgdp[pgd_index] = pfn_pgd(PFN_DOWN(next_phys), PAGE_TABLE);
+ nextp = get_pgd_next_virt(next_phys);
+ memset(nextp, 0, PAGE_SIZE);
+ } else {
+ next_phys = PFN_PHYS(_pgd_pfn(pgdp[pgd_index]));
+ nextp = get_pgd_next_virt(next_phys);
+ }
+
+ create_pgd_next_mapping(nextp, va, pa, sz, prot);
+}
+
+static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
+{
+ uintptr_t map_size = PAGE_SIZE;
+
+ /* Upgrade to PMD/PGDIR mappings whenever possible */
+ if (!(base & (PTE_PARENT_SIZE - 1)) &&
+ !(size & (PTE_PARENT_SIZE - 1)))
+ map_size = PTE_PARENT_SIZE;
+
+ return map_size;
+}
+
/*
* setup_vm() is called from head.S with MMU-off.
*
@@ -195,55 +333,115 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
"not use absolute addressing."
#endif
-asmlinkage void __init setup_vm(void)
+asmlinkage void __init setup_vm(uintptr_t dtb_pa)
{
- extern char _start;
- uintptr_t i;
- uintptr_t pa = (uintptr_t) &_start;
- pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_EXEC);
+ uintptr_t va, end_va;
+ uintptr_t load_pa = (uintptr_t)(&_start);
+ uintptr_t load_sz = (uintptr_t)(&_end) - load_pa;
+ uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);
- va_pa_offset = PAGE_OFFSET - pa;
- pfn_base = PFN_DOWN(pa);
+ va_pa_offset = PAGE_OFFSET - load_pa;
+ pfn_base = PFN_DOWN(load_pa);
+
+ /*
+ * Enforce boot alignment requirements of RV32 and
+ * RV64 by only allowing PMD or PGD mappings.
+ */
+ BUG_ON(map_size == PAGE_SIZE);
/* Sanity check alignment and size */
BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
- BUG_ON((pa % (PAGE_SIZE * PTRS_PER_PTE)) != 0);
+ BUG_ON((load_pa % map_size) != 0);
+ BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE);
+
+ /* Setup early PGD for fixmap */
+ create_pgd_mapping(early_pg_dir, FIXADDR_START,
+ (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
#ifndef __PAGETABLE_PMD_FOLDED
- trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] =
- pfn_pgd(PFN_DOWN((uintptr_t)trampoline_pmd),
- __pgprot(_PAGE_TABLE));
- trampoline_pmd[0] = pfn_pmd(PFN_DOWN(pa), prot);
+ /* Setup fixmap PMD */
+ create_pmd_mapping(fixmap_pmd, FIXADDR_START,
+ (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
+ /* Setup trampoline PGD and PMD */
+ create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+ (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
+ create_pmd_mapping(trampoline_pmd, PAGE_OFFSET,
+ load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
+#else
+ /* Setup trampoline PGD */
+ create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
+ load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC);
+#endif
- for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
- size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
+ /*
+ * Setup early PGD covering the entire kernel, which allows
+ * us to reach paging_init(). We map all memory banks later
+ * in setup_vm_final() below.
+ */
+ end_va = PAGE_OFFSET + load_sz;
+ for (va = PAGE_OFFSET; va < end_va; va += map_size)
+ create_pgd_mapping(early_pg_dir, va,
+ load_pa + (va - PAGE_OFFSET),
+ map_size, PAGE_KERNEL_EXEC);
+
+ /* Create fixed mapping for early FDT parsing */
+ end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE;
+ for (va = __fix_to_virt(FIX_FDT); va < end_va; va += PAGE_SIZE)
+ create_pte_mapping(fixmap_pte, va,
+ dtb_pa + (va - __fix_to_virt(FIX_FDT)),
+ PAGE_SIZE, PAGE_KERNEL);
+
+ /* Save pointer to DTB for early FDT parsing */
+ dtb_early_va = (void *)fix_to_virt(FIX_FDT) + (dtb_pa & ~PAGE_MASK);
+}
- swapper_pg_dir[o] =
- pfn_pgd(PFN_DOWN((uintptr_t)swapper_pmd) + i,
- __pgprot(_PAGE_TABLE));
- }
- for (i = 0; i < ARRAY_SIZE(swapper_pmd); i++)
- swapper_pmd[i] = pfn_pmd(PFN_DOWN(pa + i * PMD_SIZE), prot);
-
- swapper_pg_dir[(FIXADDR_START >> PGDIR_SHIFT) % PTRS_PER_PGD] =
- pfn_pgd(PFN_DOWN((uintptr_t)fixmap_pmd),
- __pgprot(_PAGE_TABLE));
- fixmap_pmd[(FIXADDR_START >> PMD_SHIFT) % PTRS_PER_PMD] =
- pfn_pmd(PFN_DOWN((uintptr_t)fixmap_pte),
- __pgprot(_PAGE_TABLE));
-#else
- trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] =
- pfn_pgd(PFN_DOWN(pa), prot);
+static void __init setup_vm_final(void)
+{
+ uintptr_t va, map_size;
+ phys_addr_t pa, start, end;
+ struct memblock_region *reg;
- for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) {
- size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i;
+ /* Set mmu_enabled flag */
+ mmu_enabled = true;
- swapper_pg_dir[o] =
- pfn_pgd(PFN_DOWN(pa + i * PGDIR_SIZE), prot);
+ /* Setup swapper PGD for fixmap */
+ create_pgd_mapping(swapper_pg_dir, FIXADDR_START,
+ __pa(fixmap_pgd_next),
+ PGDIR_SIZE, PAGE_TABLE);
+
+ /* Map all memory banks */
+ for_each_memblock(memory, reg) {
+ start = reg->base;
+ end = start + reg->size;
+
+ if (start >= end)
+ break;
+ if (memblock_is_nomap(reg))
+ continue;
+ if (start <= __pa(PAGE_OFFSET) &&
+ __pa(PAGE_OFFSET) < end)
+ start = __pa(PAGE_OFFSET);
+
+ map_size = best_map_size(start, end - start);
+ for (pa = start; pa < end; pa += map_size) {
+ va = (uintptr_t)__va(pa);
+ create_pgd_mapping(swapper_pg_dir, va, pa,
+ map_size, PAGE_KERNEL_EXEC);
+ }
}
- swapper_pg_dir[(FIXADDR_START >> PGDIR_SHIFT) % PTRS_PER_PGD] =
- pfn_pgd(PFN_DOWN((uintptr_t)fixmap_pte),
- __pgprot(_PAGE_TABLE));
-#endif
+ /* Clear fixmap PTE and PMD mappings */
+ clear_fixmap(FIX_PTE);
+ clear_fixmap(FIX_PMD);
+
+ /* Move to swapper page table */
+ csr_write(sptbr, PFN_DOWN(__pa(swapper_pg_dir)) | SATP_MODE);
+ local_flush_tlb_all();
+}
+
+void __init paging_init(void)
+{
+ setup_vm_final();
+ setup_zero_page();
+ zone_sizes_init();
}
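The rewritten setup_vm()/setup_vm_final() path picks its mapping granularity with best_map_size(): a region is mapped with PMD (or, when the PMD level is folded, PGDIR) leaves only when both its base and its size are multiples of that block size. A standalone sketch of the policy (the addresses in main() are illustrative):

#include <stdio.h>

static unsigned long best_map_size(unsigned long base, unsigned long size,
                                   unsigned long block)
{
        /* upgrade to the larger mapping whenever alignment allows */
        if (!(base & (block - 1)) && !(size & (block - 1)))
                return block;
        return 4096;                     /* fall back to 4 KiB pages */
}

int main(void)
{
        const unsigned long pmd = 1UL << 21;     /* 2 MiB blocks */

        printf("%lu\n", best_map_size(0x80200000UL, 0x8000000UL, pmd));
        printf("%lu\n", best_map_size(0x80201000UL, 0x8000000UL, pmd));
        return 0;
}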
diff --git a/arch/riscv/mm/sifive_l2_cache.c b/arch/riscv/mm/sifive_l2_cache.c
index 4eb64619b3f4..2e637ad71c05 100644
--- a/arch/riscv/mm/sifive_l2_cache.c
+++ b/arch/riscv/mm/sifive_l2_cache.c
@@ -109,13 +109,14 @@ EXPORT_SYMBOL_GPL(unregister_sifive_l2_error_notifier);
static irqreturn_t l2_int_handler(int irq, void *device)
{
- unsigned int regval, add_h, add_l;
+ unsigned int add_h, add_l;
if (irq == g_irq[DIR_CORR]) {
add_h = readl(l2_base + SIFIVE_L2_DIRECCFIX_HIGH);
add_l = readl(l2_base + SIFIVE_L2_DIRECCFIX_LOW);
pr_err("L2CACHE: DirError @ 0x%08X.%08X\n", add_h, add_l);
- regval = readl(l2_base + SIFIVE_L2_DIRECCFIX_COUNT);
+ /* Reading this register clears the DirError interrupt signal */
+ readl(l2_base + SIFIVE_L2_DIRECCFIX_COUNT);
atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_CE,
"DirECCFix");
}
@@ -123,7 +124,8 @@ static irqreturn_t l2_int_handler(int irq, void *device)
add_h = readl(l2_base + SIFIVE_L2_DATECCFIX_HIGH);
add_l = readl(l2_base + SIFIVE_L2_DATECCFIX_LOW);
pr_err("L2CACHE: DataError @ 0x%08X.%08X\n", add_h, add_l);
- regval = readl(l2_base + SIFIVE_L2_DATECCFIX_COUNT);
+ /* Reading this register clears the DataError interrupt signal */
+ readl(l2_base + SIFIVE_L2_DATECCFIX_COUNT);
atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_CE,
"DatECCFix");
}
@@ -131,7 +133,8 @@ static irqreturn_t l2_int_handler(int irq, void *device)
add_h = readl(l2_base + SIFIVE_L2_DATECCFAIL_HIGH);
add_l = readl(l2_base + SIFIVE_L2_DATECCFAIL_LOW);
pr_err("L2CACHE: DataFail @ 0x%08X.%08X\n", add_h, add_l);
- regval = readl(l2_base + SIFIVE_L2_DATECCFAIL_COUNT);
+ /* Reading this register clears the DataFail interrupt signal */
+ readl(l2_base + SIFIVE_L2_DATECCFAIL_COUNT);
atomic_notifier_call_chain(&l2_err_chain, SIFIVE_L2_ERR_TYPE_UE,
"DatECCFail");
}
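The point of these hunks is that the *ECC*_COUNT registers are read-to-clear: the read itself deasserts the interrupt, so it must happen even though the value is discarded. A sketch of the pattern with an explicit volatile accessor (mmio_read32(), l2_base, and count_off are placeholders for the kernel's readl() and the real register offsets):

#include <stdint.h>

static inline uint32_t mmio_read32(const volatile void *addr)
{
        return *(const volatile uint32_t *)addr;   /* never elided */
}

static void ack_ecc_irq(volatile void *l2_base, unsigned long count_off)
{
        /* value intentionally discarded; the side effect is the ack */
        (void)mmio_read32((volatile char *)l2_base + count_off);
}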
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1342654e8057..78772870facd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -94,6 +94,7 @@ config X86
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+ select ARCH_WANT_HUGE_PMD_SHARE
select ARCH_WANTS_THP_SWAP if X86_64
select BUILDTIME_EXTABLE_SORT
select CLKEVT_I8253
@@ -307,9 +308,6 @@ config ARCH_HIBERNATION_POSSIBLE
config ARCH_SUSPEND_POSSIBLE
def_bool y
-config ARCH_WANT_HUGE_PMD_SHARE
- def_bool y
-
config ARCH_WANT_GENERAL_HUGETLB
def_bool y
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index c82abd6e4ca3..9c4435307ff8 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -66,7 +66,9 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
})
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task())
+static inline bool pagefault_disabled(void);
+# define WARN_ON_IN_IRQ() \
+ WARN_ON_ONCE(!in_task() && !pagefault_disabled())
#else
# define WARN_ON_IN_IRQ()
#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 4b73f5937f41..024c3053dbba 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -373,7 +373,7 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)
return add_break(rec->ip, old);
}
-static int add_breakpoints(struct dyn_ftrace *rec, int enable)
+static int add_breakpoints(struct dyn_ftrace *rec, bool enable)
{
unsigned long ftrace_addr;
int ret;
@@ -481,7 +481,7 @@ static int add_update_nop(struct dyn_ftrace *rec)
return add_update_code(ip, new);
}
-static int add_update(struct dyn_ftrace *rec, int enable)
+static int add_update(struct dyn_ftrace *rec, bool enable)
{
unsigned long ftrace_addr;
int ret;
@@ -527,7 +527,7 @@ static int finish_update_nop(struct dyn_ftrace *rec)
return ftrace_write(ip, new, 1);
}
-static int finish_update(struct dyn_ftrace *rec, int enable)
+static int finish_update(struct dyn_ftrace *rec, bool enable)
{
unsigned long ftrace_addr;
int ret;
diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c
index 9489ffc06411..4f325e47519f 100644
--- a/drivers/acpi/acpi_video.c
+++ b/drivers/acpi/acpi_video.c
@@ -60,6 +60,12 @@ module_param(report_key_events, int, 0644);
MODULE_PARM_DESC(report_key_events,
"0: none, 1: output changes, 2: brightness changes, 3: all");
+static int hw_changes_brightness = -1;
+module_param(hw_changes_brightness, int, 0644);
+MODULE_PARM_DESC(hw_changes_brightness,
+ "Set this to 1 on buggy hw which changes the brightness itself when "
+ "a hotkey is pressed: -1: auto, 0: normal 1: hw-changes-brightness");
+
/*
* Whether the struct acpi_video_device_attrib::device_id_scheme bit should be
* assumed even if not actually set.
@@ -405,6 +411,14 @@ static int video_set_report_key_events(const struct dmi_system_id *id)
return 0;
}
+static int video_hw_changes_brightness(const struct dmi_system_id *d)
+{
+ if (hw_changes_brightness == -1)
+ hw_changes_brightness = 1;
+ return 0;
+}
+
static const struct dmi_system_id video_dmi_table[] = {
/*
* Broken _BQC workaround http://bugzilla.kernel.org/show_bug.cgi?id=13121
@@ -529,6 +543,21 @@ static const struct dmi_system_id video_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Vostro V131"),
},
},
+ /*
+ * Some machines change the brightness themselves when a brightness
+ * hotkey gets pressed, despite us telling them not to. In this case
+ * acpi_video_device_notify() should only call backlight_force_update(
+ * BACKLIGHT_UPDATE_HOTKEY) and not do anything else.
+ */
+ {
+ /* https://bugzilla.kernel.org/show_bug.cgi?id=204077 */
+ .callback = video_hw_changes_brightness,
+ .ident = "Packard Bell EasyNote MZ35",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Packard Bell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "EasyNote MZ35"),
+ },
+ },
{}
};
@@ -1612,6 +1641,14 @@ static void acpi_video_device_notify(acpi_handle handle, u32 event, void *data)
bus = video_device->video;
input = bus->input;
+ if (hw_changes_brightness > 0) {
+ if (video_device->backlight)
+ backlight_force_update(video_device->backlight,
+ BACKLIGHT_UPDATE_HOTKEY);
+ acpi_notifier_call_chain(device, event, 0);
+ return;
+ }
+
switch (event) {
case ACPI_VIDEO_NOTIFY_CYCLE_BRIGHTNESS: /* Cycle brightness */
brightness_switch_event(video_device, event);
diff --git a/drivers/acpi/acpica/exconfig.c b/drivers/acpi/acpica/exconfig.c
index 587aeeeb5070..46a8baf28bd0 100644
--- a/drivers/acpi/acpica/exconfig.c
+++ b/drivers/acpi/acpica/exconfig.c
@@ -174,12 +174,11 @@ acpi_ex_load_table_op(struct acpi_walk_state *walk_state,
return_ACPI_STATUS(status);
}
- /* Complete the initialization/resolution of package objects */
+ /* Complete the initialization/resolution of new objects */
- status = acpi_ns_walk_namespace(ACPI_TYPE_PACKAGE, ACPI_ROOT_OBJECT,
- ACPI_UINT32_MAX, 0,
- acpi_ns_init_one_package, NULL, NULL,
- NULL);
+ acpi_ex_exit_interpreter();
+ acpi_ns_initialize_objects();
+ acpi_ex_enter_interpreter();
/* Parameter Data (optional) */
@@ -437,12 +436,11 @@ acpi_ex_load_op(union acpi_operand_object *obj_desc,
return_ACPI_STATUS(status);
}
- /* Complete the initialization/resolution of package objects */
+ /* Complete the initialization/resolution of new objects */
- status = acpi_ns_walk_namespace(ACPI_TYPE_PACKAGE, ACPI_ROOT_OBJECT,
- ACPI_UINT32_MAX, 0,
- acpi_ns_init_one_package, NULL, NULL,
- NULL);
+ acpi_ex_exit_interpreter();
+ acpi_ns_initialize_objects();
+ acpi_ex_enter_interpreter();
/* Store the ddb_handle into the Target operand */
diff --git a/drivers/acpi/acpica/tbxfload.c b/drivers/acpi/acpica/tbxfload.c
index ef8f8a9f3c9c..86f1693f6d29 100644
--- a/drivers/acpi/acpica/tbxfload.c
+++ b/drivers/acpi/acpica/tbxfload.c
@@ -297,15 +297,11 @@ acpi_status acpi_load_table(struct acpi_table_header *table)
status = acpi_tb_install_and_load_table(ACPI_PTR_TO_PHYSADDR(table),
ACPI_TABLE_ORIGIN_EXTERNAL_VIRTUAL,
FALSE, &table_index);
-
if (ACPI_SUCCESS(status)) {
- /* Complete the initialization/resolution of package objects */
- status = acpi_ns_walk_namespace(ACPI_TYPE_PACKAGE,
- ACPI_ROOT_OBJECT,
- ACPI_UINT32_MAX, 0,
- acpi_ns_init_one_package,
- NULL, NULL, NULL);
+ /* Complete the initialization/resolution of new objects */
+
+ acpi_ns_initialize_objects();
}
return_ACPI_STATUS(status);
diff --git a/drivers/acpi/blacklist.c b/drivers/acpi/blacklist.c
index ad2c565f5cbe..a86a770c9b79 100644
--- a/drivers/acpi/blacklist.c
+++ b/drivers/acpi/blacklist.c
@@ -17,7 +17,9 @@
#include "internal.h"
+#ifdef CONFIG_DMI
static const struct dmi_system_id acpi_rev_dmi_table[] __initconst;
+#endif
/*
* POLICY: If *anything* doesn't work, put it on the blacklist.
@@ -61,7 +63,9 @@ int __init acpi_blacklisted(void)
}
(void)early_acpi_osi_init();
+#ifdef CONFIG_DMI
dmi_check_system(acpi_rev_dmi_table);
+#endif
return blacklisted;
}
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 23022cf20d26..c02fa27dd3f3 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2426,7 +2426,7 @@ static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
offset = to_interleave_offset(offset, mmio);
writeq(cmd, mmio->addr.base + offset);
- nvdimm_flush(nfit_blk->nd_region);
+ nvdimm_flush(nfit_blk->nd_region, NULL);
if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH)
readq(mmio->addr.base + offset);
@@ -2475,7 +2475,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
}
if (rw)
- nvdimm_flush(nfit_blk->nd_region);
+ nvdimm_flush(nfit_blk->nd_region, NULL);
rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
return rc;
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 33c30c1e6a30..b063bc41b0a9 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -1536,7 +1536,8 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
if (ret)
genpd_free_dev_data(dev, gpd_data);
else
- dev_pm_qos_add_notifier(dev, &gpd_data->nb);
+ dev_pm_qos_add_notifier(dev, &gpd_data->nb,
+ DEV_PM_QOS_RESUME_LATENCY);
return ret;
}
@@ -1569,7 +1570,8 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
pdd = dev->power.subsys_data->domain_data;
gpd_data = to_gpd_data(pdd);
- dev_pm_qos_remove_notifier(dev, &gpd_data->nb);
+ dev_pm_qos_remove_notifier(dev, &gpd_data->nb,
+ DEV_PM_QOS_RESUME_LATENCY);
genpd_lock(genpd);
@@ -1597,7 +1599,7 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
out:
genpd_unlock(genpd);
- dev_pm_qos_add_notifier(dev, &gpd_data->nb);
+ dev_pm_qos_add_notifier(dev, &gpd_data->nb, DEV_PM_QOS_RESUME_LATENCY);
return ret;
}
diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c
index 3838045c9277..daa8c7689f7e 100644
--- a/drivers/base/power/domain_governor.c
+++ b/drivers/base/power/domain_governor.c
@@ -33,7 +33,7 @@ static int dev_update_qos_constraint(struct device *dev, void *data)
* take its current PM QoS constraint (that's the only thing
* known at this point anyway).
*/
- constraint_ns = dev_pm_qos_read_value(dev);
+ constraint_ns = dev_pm_qos_read_value(dev, DEV_PM_QOS_RESUME_LATENCY);
constraint_ns *= NSEC_PER_USEC;
}
@@ -66,7 +66,7 @@ static bool default_suspend_ok(struct device *dev)
td->constraint_changed = false;
td->cached_suspend_ok = false;
td->effective_constraint_ns = 0;
- constraint_ns = __dev_pm_qos_read_value(dev);
+ constraint_ns = __dev_pm_qos_resume_latency(dev);
spin_unlock_irqrestore(&dev->power.lock, flags);
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 6c91f8df1d59..6c90fd7e2ff8 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -90,29 +90,49 @@ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask)
EXPORT_SYMBOL_GPL(dev_pm_qos_flags);
/**
- * __dev_pm_qos_read_value - Get PM QoS constraint for a given device.
+ * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device.
* @dev: Device to get the PM QoS constraint value for.
*
* This routine must be called with dev->power.lock held.
*/
-s32 __dev_pm_qos_read_value(struct device *dev)
+s32 __dev_pm_qos_resume_latency(struct device *dev)
{
lockdep_assert_held(&dev->power.lock);
- return dev_pm_qos_raw_read_value(dev);
+ return dev_pm_qos_raw_resume_latency(dev);
}
/**
* dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked).
* @dev: Device to get the PM QoS constraint value for.
+ * @type: QoS request type.
*/
-s32 dev_pm_qos_read_value(struct device *dev)
+s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type)
{
+ struct dev_pm_qos *qos = dev->power.qos;
unsigned long flags;
s32 ret;
spin_lock_irqsave(&dev->power.lock, flags);
- ret = __dev_pm_qos_read_value(dev);
+
+ switch (type) {
+ case DEV_PM_QOS_RESUME_LATENCY:
+ ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT
+ : pm_qos_read_value(&qos->resume_latency);
+ break;
+ case DEV_PM_QOS_MIN_FREQUENCY:
+ ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE
+ : pm_qos_read_value(&qos->min_frequency);
+ break;
+ case DEV_PM_QOS_MAX_FREQUENCY:
+ ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE
+ : pm_qos_read_value(&qos->max_frequency);
+ break;
+ default:
+ WARN_ON(1);
+ ret = 0;
+ }
+
spin_unlock_irqrestore(&dev->power.lock, flags);
return ret;
@@ -149,6 +169,14 @@ static int apply_constraint(struct dev_pm_qos_request *req,
req->dev->power.set_latency_tolerance(req->dev, value);
}
break;
+ case DEV_PM_QOS_MIN_FREQUENCY:
+ ret = pm_qos_update_target(&qos->min_frequency,
+ &req->data.pnode, action, value);
+ break;
+ case DEV_PM_QOS_MAX_FREQUENCY:
+ ret = pm_qos_update_target(&qos->max_frequency,
+ &req->data.pnode, action, value);
+ break;
case DEV_PM_QOS_FLAGS:
ret = pm_qos_update_flags(&qos->flags, &req->data.flr,
action, value);
@@ -177,12 +205,11 @@ static int dev_pm_qos_constraints_allocate(struct device *dev)
if (!qos)
return -ENOMEM;
- n = kzalloc(sizeof(*n), GFP_KERNEL);
+ n = kzalloc(3 * sizeof(*n), GFP_KERNEL);
if (!n) {
kfree(qos);
return -ENOMEM;
}
- BLOCKING_INIT_NOTIFIER_HEAD(n);
c = &qos->resume_latency;
plist_head_init(&c->list);
@@ -191,6 +218,7 @@ static int dev_pm_qos_constraints_allocate(struct device *dev)
c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
c->type = PM_QOS_MIN;
c->notifiers = n;
+ BLOCKING_INIT_NOTIFIER_HEAD(n);
c = &qos->latency_tolerance;
plist_head_init(&c->list);
@@ -199,6 +227,24 @@ static int dev_pm_qos_constraints_allocate(struct device *dev)
c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT;
c->type = PM_QOS_MIN;
+ c = &qos->min_frequency;
+ plist_head_init(&c->list);
+ c->target_value = PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+ c->default_value = PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+ c->no_constraint_value = PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE;
+ c->type = PM_QOS_MAX;
+ c->notifiers = ++n;
+ BLOCKING_INIT_NOTIFIER_HEAD(n);
+
+ c = &qos->max_frequency;
+ plist_head_init(&c->list);
+ c->target_value = PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
+ c->default_value = PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
+ c->no_constraint_value = PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE;
+ c->type = PM_QOS_MIN;
+ c->notifiers = ++n;
+ BLOCKING_INIT_NOTIFIER_HEAD(n);
+
INIT_LIST_HEAD(&qos->flags.list);
spin_lock_irq(&dev->power.lock);
@@ -252,11 +298,25 @@ void dev_pm_qos_constraints_destroy(struct device *dev)
apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
+
c = &qos->latency_tolerance;
plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) {
apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
+
+ c = &qos->min_frequency;
+ plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) {
+ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE);
+ memset(req, 0, sizeof(*req));
+ }
+
+ c = &qos->max_frequency;
+ plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) {
+ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
+ memset(req, 0, sizeof(*req));
+ }
+
f = &qos->flags;
list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) {
apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
@@ -368,6 +428,8 @@ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req,
switch(req->type) {
case DEV_PM_QOS_RESUME_LATENCY:
case DEV_PM_QOS_LATENCY_TOLERANCE:
+ case DEV_PM_QOS_MIN_FREQUENCY:
+ case DEV_PM_QOS_MAX_FREQUENCY:
curr_value = req->data.pnode.prio;
break;
case DEV_PM_QOS_FLAGS:
@@ -467,6 +529,7 @@ EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request);
*
* @dev: target device for the constraint
* @notifier: notifier block managed by caller.
+ * @type: request type.
*
* Will register the notifier into a notification chain that gets called
* upon changes to the target value for the device.
@@ -474,7 +537,8 @@ EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request);
* If the device's constraints object doesn't exist when this routine is called,
* it will be created (or error code will be returned if that fails).
*/
-int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier)
+int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier,
+ enum dev_pm_qos_req_type type)
{
int ret = 0;
@@ -485,10 +549,28 @@ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier)
else if (!dev->power.qos)
ret = dev_pm_qos_constraints_allocate(dev);
- if (!ret)
+ if (ret)
+ goto unlock;
+
+ switch (type) {
+ case DEV_PM_QOS_RESUME_LATENCY:
ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers,
notifier);
+ break;
+ case DEV_PM_QOS_MIN_FREQUENCY:
+ ret = blocking_notifier_chain_register(dev->power.qos->min_frequency.notifiers,
+ notifier);
+ break;
+ case DEV_PM_QOS_MAX_FREQUENCY:
+ ret = blocking_notifier_chain_register(dev->power.qos->max_frequency.notifiers,
+ notifier);
+ break;
+ default:
+ WARN_ON(1);
+ ret = -EINVAL;
+ }
+unlock:
mutex_unlock(&dev_pm_qos_mtx);
return ret;
}
@@ -500,24 +582,44 @@ EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier);
*
* @dev: target device for the constraint
* @notifier: notifier block to be removed.
+ * @type: request type.
*
* Will remove the notifier from the notification chain that gets called
* upon changes to the target value.
*/
int dev_pm_qos_remove_notifier(struct device *dev,
- struct notifier_block *notifier)
+ struct notifier_block *notifier,
+ enum dev_pm_qos_req_type type)
{
- int retval = 0;
+ int ret = 0;
mutex_lock(&dev_pm_qos_mtx);
/* Silently return if the constraints object is not present. */
- if (!IS_ERR_OR_NULL(dev->power.qos))
- retval = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers,
- notifier);
+ if (IS_ERR_OR_NULL(dev->power.qos))
+ goto unlock;
+
+ switch (type) {
+ case DEV_PM_QOS_RESUME_LATENCY:
+ ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers,
+ notifier);
+ break;
+ case DEV_PM_QOS_MIN_FREQUENCY:
+ ret = blocking_notifier_chain_unregister(dev->power.qos->min_frequency.notifiers,
+ notifier);
+ break;
+ case DEV_PM_QOS_MAX_FREQUENCY:
+ ret = blocking_notifier_chain_unregister(dev->power.qos->max_frequency.notifiers,
+ notifier);
+ break;
+ default:
+ WARN_ON(1);
+ ret = -EINVAL;
+ }
+unlock:
mutex_unlock(&dev_pm_qos_mtx);
- return retval;
+ return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier);
@@ -577,6 +679,9 @@ static void __dev_pm_qos_drop_user_request(struct device *dev,
req = dev->power.qos->flags_req;
dev->power.qos->flags_req = NULL;
break;
+ default:
+ WARN_ON(1);
+ return;
}
__dev_pm_qos_remove_request(req);
kfree(req);
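With the constraint lists split per type, notifier registration now names the constraint class of interest. A hedged usage sketch against the API added above (my_qos_cb, my_nb, and watch_max_freq() are hypothetical):

#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/pm_qos.h>

static int my_qos_cb(struct notifier_block *nb, unsigned long val, void *ptr)
{
        /* val is the new aggregate constraint for the chosen type */
        return NOTIFY_OK;
}

static struct notifier_block my_nb = { .notifier_call = my_qos_cb };

static int watch_max_freq(struct device *dev)
{
        /* one notifier chain per type; pick the max-frequency one */
        return dev_pm_qos_add_notifier(dev, &my_nb,
                                       DEV_PM_QOS_MAX_FREQUENCY);
}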
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 952a1e7057c7..b75335508d2c 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -275,7 +275,7 @@ static int rpm_check_suspend_allowed(struct device *dev)
|| (dev->power.request_pending
&& dev->power.request == RPM_REQ_RESUME))
retval = -EAGAIN;
- else if (__dev_pm_qos_read_value(dev) == 0)
+ else if (__dev_pm_qos_resume_latency(dev) == 0)
retval = -EPERM;
else if (dev->power.runtime_status == RPM_SUSPENDED)
retval = 1;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index f652c1ac3ae9..0469aceaa230 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2120,6 +2120,9 @@ static void setup_format_params(int track)
raw_cmd->kernel_data = floppy_track_buffer;
raw_cmd->length = 4 * F_SECT_PER_TRACK;
+ if (!F_SECT_PER_TRACK)
+ return;
+
/* allow for about 30ms for data transport per track */
head_shift = (F_SECT_PER_TRACK + 5) / 6;
@@ -3230,8 +3233,12 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g,
int cnt;
/* sanity checking for parameters. */
- if (g->sect <= 0 ||
- g->head <= 0 ||
+ if ((int)g->sect <= 0 ||
+ (int)g->head <= 0 ||
+ /* check for overflow in max_sector */
+ (int)(g->sect * g->head) <= 0 ||
+ /* check for zero in F_SECT_PER_TRACK */
+ (unsigned char)((g->sect << 2) >> FD_SIZECODE(g)) == 0 ||
g->track <= 0 || g->track > UDP->tracks >> STRETCH(g) ||
/* check if reserved bits are set */
(g->stretch & ~(FD_STRETCH | FD_SWAPSIDES | FD_SECTBASEMASK)) != 0)
@@ -3375,6 +3382,24 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
+static bool valid_floppy_drive_params(const short autodetect[8],
+ int native_format)
+{
+ size_t floppy_type_size = ARRAY_SIZE(floppy_type);
+ size_t i = 0;
+
+ for (i = 0; i < 8; ++i) {
+ if (autodetect[i] < 0 ||
+ autodetect[i] >= floppy_type_size)
+ return false;
+ }
+
+ if (native_format < 0 || native_format >= floppy_type_size)
+ return false;
+
+ return true;
+}
+
static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
unsigned long param)
{
@@ -3501,6 +3526,9 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
SUPBOUND(size, strlen((const char *)outparam) + 1);
break;
case FDSETDRVPRM:
+ if (!valid_floppy_drive_params(inparam.dp.autodetect,
+ inparam.dp.native_format))
+ return -EINVAL;
*UDP = inparam.dp;
break;
case FDGETDRVPRM:
@@ -3698,6 +3726,8 @@ static int compat_setdrvprm(int drive,
return -EPERM;
if (copy_from_user(&v, arg, sizeof(struct compat_floppy_drive_params)))
return -EFAULT;
+ if (!valid_floppy_drive_params(v.autodetect, v.native_format))
+ return -EINVAL;
mutex_lock(&floppy_mutex);
UDP->cmos = v.cmos;
UDP->max_dtr = v.max_dtr;
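valid_floppy_drive_params() exists because FDSETDRVPRM and its compat path copy user-controlled indices that are later used to index floppy_type[]; without the check a crafted ioctl reads out of bounds. A userspace sketch of the same bounds check (FLOPPY_TYPE_COUNT stands in for ARRAY_SIZE(floppy_type)):

#include <stdbool.h>
#include <stddef.h>

#define FLOPPY_TYPE_COUNT 32   /* stand-in for ARRAY_SIZE(floppy_type) */

static bool valid_params(const short autodetect[8], int native_format)
{
        for (size_t i = 0; i < 8; i++)
                if (autodetect[i] < 0 || autodetect[i] >= FLOPPY_TYPE_COUNT)
                        return false;
        return native_format >= 0 && native_format < FLOPPY_TYPE_COUNT;
}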
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e5009a34f9c2..3327192bb71f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -115,6 +115,8 @@ static int atomic_dec_return_safe(atomic_t *v)
#define RBD_FEATURE_LAYERING (1ULL<<0)
#define RBD_FEATURE_STRIPINGV2 (1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP (1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF (1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5)
#define RBD_FEATURE_DATA_POOL (1ULL<<7)
#define RBD_FEATURE_OPERATIONS (1ULL<<8)
@@ -122,6 +124,8 @@ static int atomic_dec_return_safe(atomic_t *v)
#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \
RBD_FEATURE_STRIPINGV2 | \
RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
RBD_FEATURE_DEEP_FLATTEN | \
RBD_FEATURE_DATA_POOL | \
RBD_FEATURE_OPERATIONS)
@@ -203,6 +207,11 @@ struct rbd_client {
struct list_head node;
};
+struct pending_result {
+ int result; /* first nonzero result */
+ int num_pending;
+};
+
struct rbd_img_request;
enum obj_request_type {
@@ -219,6 +228,18 @@ enum obj_operation_type {
OBJ_OP_ZEROOUT,
};
+#define RBD_OBJ_FLAG_DELETION (1U << 0)
+#define RBD_OBJ_FLAG_COPYUP_ENABLED (1U << 1)
+#define RBD_OBJ_FLAG_COPYUP_ZEROS (1U << 2)
+#define RBD_OBJ_FLAG_MAY_EXIST (1U << 3)
+#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT (1U << 4)
+
+enum rbd_obj_read_state {
+ RBD_OBJ_READ_START = 1,
+ RBD_OBJ_READ_OBJECT,
+ RBD_OBJ_READ_PARENT,
+};
+
/*
* Writes go through the following state machine to deal with
* layering:
@@ -245,17 +266,28 @@ enum obj_operation_type {
* even if there is a parent).
*/
enum rbd_obj_write_state {
- RBD_OBJ_WRITE_FLAT = 1,
- RBD_OBJ_WRITE_GUARD,
- RBD_OBJ_WRITE_READ_FROM_PARENT,
- RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC,
- RBD_OBJ_WRITE_COPYUP_OPS,
+ RBD_OBJ_WRITE_START = 1,
+ RBD_OBJ_WRITE_PRE_OBJECT_MAP,
+ RBD_OBJ_WRITE_OBJECT,
+ __RBD_OBJ_WRITE_COPYUP,
+ RBD_OBJ_WRITE_COPYUP,
+ RBD_OBJ_WRITE_POST_OBJECT_MAP,
+};
+
+enum rbd_obj_copyup_state {
+ RBD_OBJ_COPYUP_START = 1,
+ RBD_OBJ_COPYUP_READ_PARENT,
+ __RBD_OBJ_COPYUP_OBJECT_MAPS,
+ RBD_OBJ_COPYUP_OBJECT_MAPS,
+ __RBD_OBJ_COPYUP_WRITE_OBJECT,
+ RBD_OBJ_COPYUP_WRITE_OBJECT,
};
struct rbd_obj_request {
struct ceph_object_extent ex;
+ unsigned int flags; /* RBD_OBJ_FLAG_* */
union {
- bool tried_parent; /* for reads */
+ enum rbd_obj_read_state read_state; /* for reads */
enum rbd_obj_write_state write_state; /* for writes */
};
@@ -271,14 +303,15 @@ struct rbd_obj_request {
u32 bvec_idx;
};
};
+
+ enum rbd_obj_copyup_state copyup_state;
struct bio_vec *copyup_bvecs;
u32 copyup_bvec_count;
- struct ceph_osd_request *osd_req;
-
- u64 xferred; /* bytes transferred */
- int result;
+ struct list_head osd_reqs; /* w/ r_private_item */
+ struct mutex state_mutex;
+ struct pending_result pending;
struct kref kref;
};
@@ -287,11 +320,19 @@ enum img_req_flags {
IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
};
+enum rbd_img_state {
+ RBD_IMG_START = 1,
+ RBD_IMG_EXCLUSIVE_LOCK,
+ __RBD_IMG_OBJECT_REQUESTS,
+ RBD_IMG_OBJECT_REQUESTS,
+};
+
struct rbd_img_request {
struct rbd_device *rbd_dev;
enum obj_operation_type op_type;
enum obj_request_type data_type;
unsigned long flags;
+ enum rbd_img_state state;
union {
u64 snap_id; /* for reads */
struct ceph_snap_context *snapc; /* for writes */
@@ -300,13 +341,14 @@ struct rbd_img_request {
struct request *rq; /* block request */
struct rbd_obj_request *obj_request; /* obj req initiator */
};
- spinlock_t completion_lock;
- u64 xferred;/* aggregate bytes transferred */
- int result; /* first nonzero obj_request result */
+ struct list_head lock_item;
struct list_head object_extents; /* obj_req.ex structs */
- u32 pending_count;
+ struct mutex state_mutex;
+ struct pending_result pending;
+ struct work_struct work;
+ int work_result;
struct kref kref;
};
@@ -380,7 +422,17 @@ struct rbd_device {
struct work_struct released_lock_work;
struct delayed_work lock_dwork;
struct work_struct unlock_work;
- wait_queue_head_t lock_waitq;
+ spinlock_t lock_lists_lock;
+ struct list_head acquiring_list;
+ struct list_head running_list;
+ struct completion acquire_wait;
+ int acquire_err;
+ struct completion releasing_wait;
+
+ spinlock_t object_map_lock;
+ u8 *object_map;
+ u64 object_map_size; /* in objects */
+ u64 object_map_flags;
struct workqueue_struct *task_wq;
@@ -408,12 +460,10 @@ struct rbd_device {
* Flag bits for rbd_dev->flags:
* - REMOVING (which is coupled with rbd_dev->open_count) is protected
* by rbd_dev->lock
- * - BLACKLISTED is protected by rbd_dev->lock_rwsem
*/
enum rbd_dev_flags {
RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
- RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};
static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
@@ -466,6 +516,8 @@ static int minor_to_rbd_dev_id(int minor)
static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
+ lockdep_assert_held(&rbd_dev->lock_rwsem);
+
return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}
@@ -583,6 +635,26 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
u64 *snap_features);
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
+
+static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
+static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
+
+/*
+ * Return true if nothing else is pending.
+ */
+static bool pending_result_dec(struct pending_result *pending, int *result)
+{
+ rbd_assert(pending->num_pending > 0);
+
+ if (*result && !pending->result)
+ pending->result = *result;
+ if (--pending->num_pending)
+ return false;
+
+ *result = pending->result;
+ return true;
+}
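pending_result replaces the old per-request result/xferred bookkeeping with a fan-out/fan-in counter: each sub-request decrements num_pending, the first nonzero result wins, and completion fires only on the last decrement. A runnable userspace sketch of the same pattern:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct pending_result { int result; int num_pending; };

static bool pending_dec(struct pending_result *p, int *result)
{
        assert(p->num_pending > 0);
        if (*result && !p->result)
                p->result = *result;      /* keep the first error only */
        if (--p->num_pending)
                return false;             /* others are still in flight */
        *result = p->result;
        return true;                      /* caller completes now */
}

int main(void)
{
        struct pending_result p = { .result = 0, .num_pending = 3 };
        int results[] = { 0, -5, 0 };     /* say the second op fails */

        for (int i = 0; i < 3; i++) {
                int r = results[i];
                if (pending_dec(&p, &r))
                        printf("complete, result=%d\n", r);   /* -5 */
        }
        return 0;
}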
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
@@ -1317,6 +1389,8 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
u32 bytes)
{
+ dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
+
switch (obj_req->img_request->data_type) {
case OBJ_REQUEST_BIO:
zero_bios(&obj_req->bio_pos, off, bytes);
@@ -1339,13 +1413,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
-static void rbd_img_request_get(struct rbd_img_request *img_request)
-{
- dout("%s: img %p (was %d)\n", __func__, img_request,
- kref_read(&img_request->kref));
- kref_get(&img_request->kref);
-}
-
static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
@@ -1362,7 +1429,6 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
/* Image request now owns object's original reference */
obj_request->img_request = img_request;
- img_request->pending_count++;
dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}
@@ -1375,13 +1441,13 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
rbd_obj_request_put(obj_request);
}
-static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
+static void rbd_osd_submit(struct ceph_osd_request *osd_req)
{
- struct ceph_osd_request *osd_req = obj_request->osd_req;
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
- dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
- obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
- obj_request->ex.oe_len, osd_req);
+ dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
+ __func__, osd_req, obj_req, obj_req->ex.oe_objno,
+ obj_req->ex.oe_off, obj_req->ex.oe_len);
ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}
@@ -1457,41 +1523,38 @@ static bool rbd_img_is_write(struct rbd_img_request *img_req)
}
}
-static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
-
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
struct rbd_obj_request *obj_req = osd_req->r_priv;
+ int result;
dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
osd_req->r_result, obj_req);
- rbd_assert(osd_req == obj_req->osd_req);
- obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
- if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
- obj_req->xferred = osd_req->r_result;
+ /*
+ * Writes aren't allowed to return a data payload. In some
+ * guarded write cases (e.g. stat + zero on an empty object)
+ * a stat response makes it through, but we don't care.
+ */
+ if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
+ result = 0;
else
- /*
- * Writes aren't allowed to return a data payload. In some
- * guarded write cases (e.g. stat + zero on an empty object)
- * a stat response makes it through, but we don't care.
- */
- obj_req->xferred = 0;
+ result = osd_req->r_result;
- rbd_obj_handle_request(obj_req);
+ rbd_obj_handle_request(obj_req, result);
}
-static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
+static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{
- struct ceph_osd_request *osd_req = obj_request->osd_req;
+ struct rbd_obj_request *obj_request = osd_req->r_priv;
osd_req->r_flags = CEPH_OSD_FLAG_READ;
osd_req->r_snapid = obj_request->img_request->snap_id;
}
-static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
+static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
{
- struct ceph_osd_request *osd_req = obj_request->osd_req;
+ struct rbd_obj_request *obj_request = osd_req->r_priv;
osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
ktime_get_real_ts64(&osd_req->r_mtime);
@@ -1499,19 +1562,21 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
}
static struct ceph_osd_request *
-__rbd_osd_req_create(struct rbd_obj_request *obj_req,
- struct ceph_snap_context *snapc, unsigned int num_ops)
+__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
+ struct ceph_snap_context *snapc, int num_ops)
{
struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct ceph_osd_request *req;
const char *name_format = rbd_dev->image_format == 1 ?
RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
+ int ret;
req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
if (!req)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+ list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
req->r_callback = rbd_osd_req_callback;
req->r_priv = obj_req;
@@ -1522,27 +1587,20 @@ __rbd_osd_req_create(struct rbd_obj_request *obj_req,
ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
req->r_base_oloc.pool = rbd_dev->layout.pool_id;
- if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
- rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
- goto err_req;
+ ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
+ rbd_dev->header.object_prefix,
+ obj_req->ex.oe_objno);
+ if (ret)
+ return ERR_PTR(ret);
return req;
-
-err_req:
- ceph_osdc_put_request(req);
- return NULL;
}
static struct ceph_osd_request *
-rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
+rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
- return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc,
- num_ops);
-}
-
-static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
-{
- ceph_osdc_put_request(osd_req);
+ return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
+ num_ops);
}
static struct rbd_obj_request *rbd_obj_request_create(void)
@@ -1554,6 +1612,8 @@ static struct rbd_obj_request *rbd_obj_request_create(void)
return NULL;
ceph_object_extent_init(&obj_request->ex);
+ INIT_LIST_HEAD(&obj_request->osd_reqs);
+ mutex_init(&obj_request->state_mutex);
kref_init(&obj_request->kref);
dout("%s %p\n", __func__, obj_request);
@@ -1563,14 +1623,19 @@ static struct rbd_obj_request *rbd_obj_request_create(void)
static void rbd_obj_request_destroy(struct kref *kref)
{
struct rbd_obj_request *obj_request;
+ struct ceph_osd_request *osd_req;
u32 i;
obj_request = container_of(kref, struct rbd_obj_request, kref);
dout("%s: obj %p\n", __func__, obj_request);
- if (obj_request->osd_req)
- rbd_osd_req_destroy(obj_request->osd_req);
+ while (!list_empty(&obj_request->osd_reqs)) {
+ osd_req = list_first_entry(&obj_request->osd_reqs,
+ struct ceph_osd_request, r_private_item);
+ list_del_init(&osd_req->r_private_item);
+ ceph_osdc_put_request(osd_req);
+ }
switch (obj_request->img_request->data_type) {
case OBJ_REQUEST_NODATA:
@@ -1684,8 +1749,9 @@ static struct rbd_img_request *rbd_img_request_create(
if (rbd_dev_parent_get(rbd_dev))
img_request_layered_set(img_request);
- spin_lock_init(&img_request->completion_lock);
+ INIT_LIST_HEAD(&img_request->lock_item);
INIT_LIST_HEAD(&img_request->object_extents);
+ mutex_init(&img_request->state_mutex);
kref_init(&img_request->kref);
dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
@@ -1703,6 +1769,7 @@ static void rbd_img_request_destroy(struct kref *kref)
dout("%s: img %p\n", __func__, img_request);
+ WARN_ON(!list_empty(&img_request->lock_item));
for_each_obj_request_safe(img_request, obj_request, next_obj_request)
rbd_img_obj_request_del(img_request, obj_request);
@@ -1717,6 +1784,466 @@ static void rbd_img_request_destroy(struct kref *kref)
kmem_cache_free(rbd_img_request_cache, img_request);
}
+#define BITS_PER_OBJ 2
+#define OBJS_PER_BYTE (BITS_PER_BYTE / BITS_PER_OBJ)
+#define OBJ_MASK ((1 << BITS_PER_OBJ) - 1)
+
+static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
+ u64 *index, u8 *shift)
+{
+ u32 off;
+
+ rbd_assert(objno < rbd_dev->object_map_size);
+ *index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
+ *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
+}
+
+static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+ u64 index;
+ u8 shift;
+
+ lockdep_assert_held(&rbd_dev->object_map_lock);
+ __rbd_object_map_index(rbd_dev, objno, &index, &shift);
+ return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
+}
+
+static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
+{
+ u64 index;
+ u8 shift;
+ u8 *p;
+
+ lockdep_assert_held(&rbd_dev->object_map_lock);
+ rbd_assert(!(val & ~OBJ_MASK));
+
+ __rbd_object_map_index(rbd_dev, objno, &index, &shift);
+ p = &rbd_dev->object_map[index];
+ *p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
+}
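
Each object is tracked with BITS_PER_OBJ = 2 bits, packed big-endian within a byte: object 0 occupies the two most significant bits of byte 0, so four objects fit per byte. A minimal userspace sketch of the index/shift math above (the plain-C stand-in for div_u64_rem() and the OBJECT_EXISTS value of 1 are assumptions here):

    #include <assert.h>
    #include <stdint.h>

    #define BITS_PER_BYTE	8
    #define BITS_PER_OBJ	2
    #define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)	/* 4 */
    #define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)	/* 0x3 */

    int main(void)
    {
    	uint8_t map[2] = { 0 };
    	uint64_t objno = 5;

    	/* objno 5: index = 5 / 4 = 1, off = 5 % 4 = 1,
    	 * shift = (4 - 1 - 1) * BITS_PER_OBJ = 4 */
    	uint64_t index = objno / OBJS_PER_BYTE;
    	uint8_t shift = (OBJS_PER_BYTE - objno % OBJS_PER_BYTE - 1) *
    			BITS_PER_OBJ;

    	/* set objno 5 to OBJECT_EXISTS (1), then read it back */
    	map[index] = (map[index] & ~(OBJ_MASK << shift)) | (1 << shift);
    	assert(((map[index] >> shift) & OBJ_MASK) == 1);
    	assert(map[1] == 0x10);
    	return 0;
    }
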
+
+static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+ u8 state;
+
+ spin_lock(&rbd_dev->object_map_lock);
+ state = __rbd_object_map_get(rbd_dev, objno);
+ spin_unlock(&rbd_dev->object_map_lock);
+ return state;
+}
+
+static bool use_object_map(struct rbd_device *rbd_dev)
+{
+ return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
+ !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
+}
+
+static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
+{
+ u8 state;
+
+ /* fall back to default logic if object map is disabled or invalid */
+ if (!use_object_map(rbd_dev))
+ return true;
+
+ state = rbd_object_map_get(rbd_dev, objno);
+ return state != OBJECT_NONEXISTENT;
+}
+
+static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
+ struct ceph_object_id *oid)
+{
+ if (snap_id == CEPH_NOSNAP)
+ ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
+ rbd_dev->spec->image_id);
+ else
+ ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
+ rbd_dev->spec->image_id, snap_id);
+}
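
Assuming RBD_OBJECT_MAP_PREFIX is the usual "rbd_object_map." string, the resulting object names look like this (image id and snap id are made up for illustration):

    /* HEAD object map:     "rbd_object_map.86696fe308f5"
     * snapshot object map: "rbd_object_map.86696fe308f5.0000000000000004"
     *                      (snap_id printed as %016llx)
     */
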
+
+static int rbd_object_map_lock(struct rbd_device *rbd_dev)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ CEPH_DEFINE_OID_ONSTACK(oid);
+ u8 lock_type;
+ char *lock_tag;
+ struct ceph_locker *lockers;
+ u32 num_lockers;
+ bool broke_lock = false;
+ int ret;
+
+ rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+again:
+ ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+ CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
+ if (ret != -EBUSY || broke_lock) {
+ if (ret == -EEXIST)
+ ret = 0; /* already locked by myself */
+ if (ret)
+ rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
+ return ret;
+ }
+
+ ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
+ RBD_LOCK_NAME, &lock_type, &lock_tag,
+ &lockers, &num_lockers);
+ if (ret) {
+ if (ret == -ENOENT)
+ goto again;
+
+ rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
+ return ret;
+ }
+
+ kfree(lock_tag);
+ if (num_lockers == 0)
+ goto again;
+
+ rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
+ ENTITY_NAME(lockers[0].id.name));
+
+ ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
+ RBD_LOCK_NAME, lockers[0].id.cookie,
+ &lockers[0].id.name);
+ ceph_free_lockers(lockers, num_lockers);
+ if (ret) {
+ if (ret == -ENOENT)
+ goto again;
+
+ rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
+ return ret;
+ }
+
+ broke_lock = true;
+ goto again;
+}
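
The loop above is a small break-lock protocol; a sketch of the control flow, not literal code:

    /*
     * ceph_cls_lock()       -> 0 / -EEXIST : locked (or already ours), done
     *                       -> -EBUSY      : someone else holds it
     * ceph_cls_lock_info()  -> -ENOENT     : lock vanished, retry
     *                       -> no lockers  : lock vanished, retry
     * ceph_cls_break_lock() -> -ENOENT     : lock vanished, retry
     *                       -> 0           : broke it, retry the lock
     * broke_lock ensures the lock is broken at most once; a second
     * -EBUSY after breaking is reported to the caller instead of looping.
     */
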
+
+static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ CEPH_DEFINE_OID_ONSTACK(oid);
+ int ret;
+
+ rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+ ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+ "");
+ if (ret && ret != -ENOENT)
+ rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
+}
+
+static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
+{
+ u8 struct_v;
+ u32 struct_len;
+ u32 header_len;
+ void *header_end;
+ int ret;
+
+ ceph_decode_32_safe(p, end, header_len, e_inval);
+ header_end = *p + header_len;
+
+ ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
+ &struct_len);
+ if (ret)
+ return ret;
+
+ ceph_decode_64_safe(p, end, *object_map_size, e_inval);
+
+ *p = header_end;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
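
For reference, a sketch of the reply bytes consumed above; ceph_start_decoding() reads the standard version/compat/length preamble (the exact field names are assumptions):

    /*
     * le32 header_len;         bytes of header that follow
     *   u8   struct_v;         BitVector header version (>= 1)
     *   u8   struct_compat;
     *   le32 struct_len;
     *   le64 object_map_size;  number of objects tracked
     *   ...                    remainder skipped via *p = header_end
     */
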
+
+static int __rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ CEPH_DEFINE_OID_ONSTACK(oid);
+ struct page **pages;
+ void *p, *end;
+ size_t reply_len;
+ u64 num_objects;
+ u64 object_map_bytes;
+ u64 object_map_size;
+ int num_pages;
+ int ret;
+
+ rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
+
+ num_objects = ceph_get_num_objects(&rbd_dev->layout,
+ rbd_dev->mapping.size);
+ object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
+ BITS_PER_BYTE);
+ num_pages = calc_pages_for(0, object_map_bytes) + 1;
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ reply_len = num_pages * PAGE_SIZE;
+ rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
+ ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
+ "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
+ NULL, 0, pages, &reply_len);
+ if (ret)
+ goto out;
+
+ p = page_address(pages[0]);
+ end = p + min(reply_len, (size_t)PAGE_SIZE);
+ ret = decode_object_map_header(&p, end, &object_map_size);
+ if (ret)
+ goto out;
+
+ if (object_map_size != num_objects) {
+ rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
+ object_map_size, num_objects);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (offset_in_page(p) + object_map_bytes > reply_len) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
+ if (!rbd_dev->object_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rbd_dev->object_map_size = object_map_size;
+ ceph_copy_from_page_vector(pages, rbd_dev->object_map,
+ offset_in_page(p), object_map_bytes);
+
+out:
+ ceph_release_page_vector(pages, num_pages);
+ return ret;
+}
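
A worked example of the sizing math, assuming 4 KiB pages and the default 4 MiB object size: a 1 GiB mapping gives 256 objects, so

    /*
     * object_map_bytes = DIV_ROUND_UP(256 * 2, 8) = 64
     * calc_pages_for(0, 64) = 1, +1 page for the header that precedes
     * the bitmap in the reply -> num_pages = 2, reply_len = 8192
     */
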
+
+static void rbd_object_map_free(struct rbd_device *rbd_dev)
+{
+ kvfree(rbd_dev->object_map);
+ rbd_dev->object_map = NULL;
+ rbd_dev->object_map_size = 0;
+}
+
+static int rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+ int ret;
+
+ ret = __rbd_object_map_load(rbd_dev);
+ if (ret)
+ return ret;
+
+ ret = rbd_dev_v2_get_flags(rbd_dev);
+ if (ret) {
+ rbd_object_map_free(rbd_dev);
+ return ret;
+ }
+
+ if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
+ rbd_warn(rbd_dev, "object map is invalid");
+
+ return 0;
+}
+
+static int rbd_object_map_open(struct rbd_device *rbd_dev)
+{
+ int ret;
+
+ ret = rbd_object_map_lock(rbd_dev);
+ if (ret)
+ return ret;
+
+ ret = rbd_object_map_load(rbd_dev);
+ if (ret) {
+ rbd_object_map_unlock(rbd_dev);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void rbd_object_map_close(struct rbd_device *rbd_dev)
+{
+ rbd_object_map_free(rbd_dev);
+ rbd_object_map_unlock(rbd_dev);
+}
+
+/*
+ * This function needs snap_id (or more precisely just something to
+ * distinguish between HEAD and snapshot object maps), new_state and
+ * current_state that were passed to rbd_object_map_update().
+ *
+ * To avoid allocating and stashing a context we piggyback on the OSD
+ * request: a HEAD update carries two ops (the extra one is
+ * assert_locked), while a snapshot update carries one.  For new_state
+ * and current_state we decode our own object_map_update op, encoded in
+ * rbd_cls_object_map_update().
+ */
+static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
+ struct ceph_osd_request *osd_req)
+{
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+ struct ceph_osd_data *osd_data;
+ u64 objno;
+ u8 state, new_state, current_state;
+ bool has_current_state;
+ void *p;
+
+ if (osd_req->r_result)
+ return osd_req->r_result;
+
+ /*
+ * Nothing to do for a snapshot object map.
+ */
+ if (osd_req->r_num_ops == 1)
+ return 0;
+
+ /*
+ * Update in-memory HEAD object map.
+ */
+ rbd_assert(osd_req->r_num_ops == 2);
+ osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
+ rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
+
+ p = page_address(osd_data->pages[0]);
+ objno = ceph_decode_64(&p);
+ rbd_assert(objno == obj_req->ex.oe_objno);
+ rbd_assert(ceph_decode_64(&p) == objno + 1);
+ new_state = ceph_decode_8(&p);
+ has_current_state = ceph_decode_8(&p);
+ if (has_current_state)
+ current_state = ceph_decode_8(&p);
+
+ spin_lock(&rbd_dev->object_map_lock);
+ state = __rbd_object_map_get(rbd_dev, objno);
+ if (!has_current_state || current_state == state ||
+ (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
+ __rbd_object_map_set(rbd_dev, objno, new_state);
+ spin_unlock(&rbd_dev->object_map_lock);
+
+ return 0;
+}
+
+static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
+{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
+ int result;
+
+ dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
+ osd_req->r_result, obj_req);
+
+ result = rbd_object_map_update_finish(obj_req, osd_req);
+ rbd_obj_handle_request(obj_req, result);
+}
+
+static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
+{
+ u8 state = rbd_object_map_get(rbd_dev, objno);
+
+ if (state == new_state ||
+ (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
+ (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
+ return false;
+
+ return true;
+}
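
Spelled out, the transitions that update_needed() filters out (state -> new_state):

    /*
     * state == new_state                            -> no update
     * OBJECT_NONEXISTENT -> OBJECT_PENDING          -> no update (nothing
     *                                                  to mark pending)
     * state != OBJECT_PENDING -> OBJECT_NONEXISTENT -> no update (only a
     *                                                  pending delete may
     *                                                  go nonexistent)
     * anything else (e.g. NONEXISTENT -> EXISTS,
     *                PENDING -> NONEXISTENT)        -> update needed
     */
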
+
+static int rbd_cls_object_map_update(struct ceph_osd_request *req,
+ int which, u64 objno, u8 new_state,
+ const u8 *current_state)
+{
+ struct page **pages;
+ void *p, *start;
+ int ret;
+
+ ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
+ if (ret)
+ return ret;
+
+ pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ p = start = page_address(pages[0]);
+ ceph_encode_64(&p, objno);
+ ceph_encode_64(&p, objno + 1);
+ ceph_encode_8(&p, new_state);
+ if (current_state) {
+ ceph_encode_8(&p, 1);
+ ceph_encode_8(&p, *current_state);
+ } else {
+ ceph_encode_8(&p, 0);
+ }
+
+ osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
+ false, true);
+ return 0;
+}
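
For illustration, the request data encoded above for objno 5, new_state OBJECT_EXISTS (1) and no current_state would be 18 little-endian bytes:

    /*
     * 05 00 00 00 00 00 00 00   le64 start objno  = 5
     * 06 00 00 00 00 00 00 00   le64 end objno    = 6 (half-open range)
     * 01                        u8   new_state    = OBJECT_EXISTS
     * 00                        u8   has current_state = false
     *                           (one more u8 follows when it is true)
     */
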
+
+/*
+ * Return:
+ * 0 - object map update sent
+ * 1 - object map update isn't needed
+ * <0 - error
+ */
+static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
+ u8 new_state, const u8 *current_state)
+{
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct ceph_osd_request *req;
+ int num_ops = 1;
+ int which = 0;
+ int ret;
+
+ if (snap_id == CEPH_NOSNAP) {
+ if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
+ return 1;
+
+ num_ops++; /* assert_locked */
+ }
+
+ req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
+ req->r_callback = rbd_object_map_callback;
+ req->r_priv = obj_req;
+
+ rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
+ req->r_flags = CEPH_OSD_FLAG_WRITE;
+ ktime_get_real_ts64(&req->r_mtime);
+
+ if (snap_id == CEPH_NOSNAP) {
+ /*
+ * Protect against possible race conditions during lock
+ * ownership transitions.
+ */
+ ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
+ CEPH_CLS_LOCK_EXCLUSIVE, "", "");
+ if (ret)
+ return ret;
+ }
+
+ ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
+ new_state, current_state);
+ if (ret)
+ return ret;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ return ret;
+
+ ceph_osdc_start_request(osdc, req, false);
+ return 0;
+}
+
static void prune_extents(struct ceph_file_extent *img_extents,
u32 *num_img_extents, u64 overlap)
{
@@ -1764,11 +2291,13 @@ static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
return 0;
}
-static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
+static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
+
switch (obj_req->img_request->data_type) {
case OBJ_REQUEST_BIO:
- osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
+ osd_req_op_extent_osd_data_bio(osd_req, which,
&obj_req->bio_pos,
obj_req->ex.oe_len);
break;
@@ -1777,7 +2306,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
rbd_assert(obj_req->bvec_pos.iter.bi_size ==
obj_req->ex.oe_len);
rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
- osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
+ osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
&obj_req->bvec_pos);
break;
default:
@@ -1785,22 +2314,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
}
}
-static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
-{
- obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1);
- if (!obj_req->osd_req)
- return -ENOMEM;
-
- osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
- obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
- rbd_osd_req_setup_data(obj_req, 0);
-
- rbd_osd_req_format_read(obj_req);
- return 0;
-}
-
-static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
- unsigned int which)
+static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
{
struct page **pages;
@@ -1816,45 +2330,60 @@ static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
if (IS_ERR(pages))
return PTR_ERR(pages);
- osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
- osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
+ osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
+ osd_req_op_raw_data_in_pages(osd_req, which, pages,
8 + sizeof(struct ceph_timespec),
0, false, true);
return 0;
}
-static int count_write_ops(struct rbd_obj_request *obj_req)
+static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
+ u32 bytes)
+{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
+ int ret;
+
+ ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
+ if (ret)
+ return ret;
+
+ osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
+ obj_req->copyup_bvec_count, bytes);
+ return 0;
+}
+
+static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
{
- return 2; /* setallochint + write/writefull */
+ obj_req->read_state = RBD_OBJ_READ_START;
+ return 0;
}
-static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
- unsigned int which)
+static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
+ int which)
{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
u16 opcode;
- osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
- rbd_dev->layout.object_size,
- rbd_dev->layout.object_size);
+ if (!use_object_map(rbd_dev) ||
+ !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
+ osd_req_op_alloc_hint_init(osd_req, which++,
+ rbd_dev->layout.object_size,
+ rbd_dev->layout.object_size);
+ }
if (rbd_obj_is_entire(obj_req))
opcode = CEPH_OSD_OP_WRITEFULL;
else
opcode = CEPH_OSD_OP_WRITE;
- osd_req_op_extent_init(obj_req->osd_req, which, opcode,
+ osd_req_op_extent_init(osd_req, which, opcode,
obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
- rbd_osd_req_setup_data(obj_req, which++);
-
- rbd_assert(which == obj_req->osd_req->r_num_ops);
- rbd_osd_req_format_write(obj_req);
+ rbd_osd_setup_data(osd_req, which);
}
-static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
+static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
{
- unsigned int num_osd_ops, which = 0;
- bool need_guard;
int ret;
/* reverse map the entire object onto the parent */
@@ -1862,24 +2391,10 @@ static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
if (ret)
return ret;
- need_guard = rbd_obj_copyup_enabled(obj_req);
- num_osd_ops = need_guard + count_write_ops(obj_req);
-
- obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
- if (!obj_req->osd_req)
- return -ENOMEM;
-
- if (need_guard) {
- ret = __rbd_obj_setup_stat(obj_req, which++);
- if (ret)
- return ret;
+ if (rbd_obj_copyup_enabled(obj_req))
+ obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
- obj_req->write_state = RBD_OBJ_WRITE_GUARD;
- } else {
- obj_req->write_state = RBD_OBJ_WRITE_FLAT;
- }
-
- __rbd_obj_setup_write(obj_req, which);
+ obj_req->write_state = RBD_OBJ_WRITE_START;
return 0;
}
@@ -1889,11 +2404,26 @@ static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
CEPH_OSD_OP_ZERO;
}
-static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
+static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
+ int which)
+{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
+
+ if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
+ rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
+ osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
+ } else {
+ osd_req_op_extent_init(osd_req, which,
+ truncate_or_zero_opcode(obj_req),
+ obj_req->ex.oe_off, obj_req->ex.oe_len,
+ 0, 0);
+ }
+}
+
+static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
{
struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
- u64 off = obj_req->ex.oe_off;
- u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
+ u64 off, next_off;
int ret;
/*
@@ -1906,10 +2436,17 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
*/
if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
!rbd_obj_is_tail(obj_req)) {
- off = round_up(off, rbd_dev->opts->alloc_size);
- next_off = round_down(next_off, rbd_dev->opts->alloc_size);
+ off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
+ next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
+ rbd_dev->opts->alloc_size);
if (off >= next_off)
return 1;
+
+ dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
+ obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
+ off, next_off - off);
+ obj_req->ex.oe_off = off;
+ obj_req->ex.oe_len = next_off - off;
}
/* reverse map the entire object onto the parent */
@@ -1917,52 +2454,29 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
if (ret)
return ret;
- obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
- if (!obj_req->osd_req)
- return -ENOMEM;
-
- if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
- osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0);
- } else {
- dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
- obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
- off, next_off - off);
- osd_req_op_extent_init(obj_req->osd_req, 0,
- truncate_or_zero_opcode(obj_req),
- off, next_off - off, 0, 0);
- }
+ obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
+ if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
+ obj_req->flags |= RBD_OBJ_FLAG_DELETION;
- obj_req->write_state = RBD_OBJ_WRITE_FLAT;
- rbd_osd_req_format_write(obj_req);
+ obj_req->write_state = RBD_OBJ_WRITE_START;
return 0;
}
-static int count_zeroout_ops(struct rbd_obj_request *obj_req)
-{
- int num_osd_ops;
-
- if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
- !rbd_obj_copyup_enabled(obj_req))
- num_osd_ops = 2; /* create + truncate */
- else
- num_osd_ops = 1; /* delete/truncate/zero */
-
- return num_osd_ops;
-}
-
-static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req,
- unsigned int which)
+static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
+ int which)
{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
u16 opcode;
if (rbd_obj_is_entire(obj_req)) {
if (obj_req->num_img_extents) {
- if (!rbd_obj_copyup_enabled(obj_req))
- osd_req_op_init(obj_req->osd_req, which++,
+ if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
+ osd_req_op_init(osd_req, which++,
CEPH_OSD_OP_CREATE, 0);
opcode = CEPH_OSD_OP_TRUNCATE;
} else {
- osd_req_op_init(obj_req->osd_req, which++,
+ rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
+ osd_req_op_init(osd_req, which++,
CEPH_OSD_OP_DELETE, 0);
opcode = 0;
}
@@ -1971,18 +2485,13 @@ static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req,
}
if (opcode)
- osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
+ osd_req_op_extent_init(osd_req, which, opcode,
obj_req->ex.oe_off, obj_req->ex.oe_len,
0, 0);
-
- rbd_assert(which == obj_req->osd_req->r_num_ops);
- rbd_osd_req_format_write(obj_req);
}
-static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
+static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
{
- unsigned int num_osd_ops, which = 0;
- bool need_guard;
int ret;
/* reverse map the entire object onto the parent */
@@ -1990,31 +2499,66 @@ static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
if (ret)
return ret;
- need_guard = rbd_obj_copyup_enabled(obj_req);
- num_osd_ops = need_guard + count_zeroout_ops(obj_req);
+ if (rbd_obj_copyup_enabled(obj_req))
+ obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
+ if (!obj_req->num_img_extents) {
+ obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
+ if (rbd_obj_is_entire(obj_req))
+ obj_req->flags |= RBD_OBJ_FLAG_DELETION;
+ }
- obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
- if (!obj_req->osd_req)
- return -ENOMEM;
+ obj_req->write_state = RBD_OBJ_WRITE_START;
+ return 0;
+}
- if (need_guard) {
- ret = __rbd_obj_setup_stat(obj_req, which++);
- if (ret)
- return ret;
+static int count_write_ops(struct rbd_obj_request *obj_req)
+{
+ struct rbd_img_request *img_req = obj_req->img_request;
- obj_req->write_state = RBD_OBJ_WRITE_GUARD;
- } else {
- obj_req->write_state = RBD_OBJ_WRITE_FLAT;
+ switch (img_req->op_type) {
+ case OBJ_OP_WRITE:
+ if (!use_object_map(img_req->rbd_dev) ||
+ !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
+ return 2; /* setallochint + write/writefull */
+
+ return 1; /* write/writefull */
+ case OBJ_OP_DISCARD:
+ return 1; /* delete/truncate/zero */
+ case OBJ_OP_ZEROOUT:
+ if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
+ !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
+ return 2; /* create + truncate */
+
+ return 1; /* delete/truncate/zero */
+ default:
+ BUG();
}
+}
- __rbd_obj_setup_zeroout(obj_req, which);
- return 0;
+static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
+ int which)
+{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
+
+ switch (obj_req->img_request->op_type) {
+ case OBJ_OP_WRITE:
+ __rbd_osd_setup_write_ops(osd_req, which);
+ break;
+ case OBJ_OP_DISCARD:
+ __rbd_osd_setup_discard_ops(osd_req, which);
+ break;
+ case OBJ_OP_ZEROOUT:
+ __rbd_osd_setup_zeroout_ops(osd_req, which);
+ break;
+ default:
+ BUG();
+ }
}
/*
- * For each object request in @img_req, allocate an OSD request, add
- * individual OSD ops and prepare them for submission. The number of
- * OSD ops depends on op_type and the overlap point (if any).
+ * Prune the list of object requests (adjust offset and/or length, drop
+ * redundant requests). Prepare object request state machines and image
+ * request state machine for execution.
*/
static int __rbd_img_fill_request(struct rbd_img_request *img_req)
{
@@ -2024,16 +2568,16 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req)
for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
switch (img_req->op_type) {
case OBJ_OP_READ:
- ret = rbd_obj_setup_read(obj_req);
+ ret = rbd_obj_init_read(obj_req);
break;
case OBJ_OP_WRITE:
- ret = rbd_obj_setup_write(obj_req);
+ ret = rbd_obj_init_write(obj_req);
break;
case OBJ_OP_DISCARD:
- ret = rbd_obj_setup_discard(obj_req);
+ ret = rbd_obj_init_discard(obj_req);
break;
case OBJ_OP_ZEROOUT:
- ret = rbd_obj_setup_zeroout(obj_req);
+ ret = rbd_obj_init_zeroout(obj_req);
break;
default:
BUG();
@@ -2041,17 +2585,12 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req)
if (ret < 0)
return ret;
if (ret > 0) {
- img_req->xferred += obj_req->ex.oe_len;
- img_req->pending_count--;
rbd_img_obj_request_del(img_req, obj_req);
continue;
}
-
- ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
- if (ret)
- return ret;
}
+ img_req->state = RBD_IMG_START;
return 0;
}
@@ -2340,17 +2879,55 @@ static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
&it);
}
-static void rbd_img_request_submit(struct rbd_img_request *img_request)
+static void rbd_img_handle_request_work(struct work_struct *work)
{
- struct rbd_obj_request *obj_request;
+ struct rbd_img_request *img_req =
+ container_of(work, struct rbd_img_request, work);
- dout("%s: img %p\n", __func__, img_request);
+ rbd_img_handle_request(img_req, img_req->work_result);
+}
- rbd_img_request_get(img_request);
- for_each_obj_request(img_request, obj_request)
- rbd_obj_request_submit(obj_request);
+static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
+{
+ INIT_WORK(&img_req->work, rbd_img_handle_request_work);
+ img_req->work_result = result;
+ queue_work(rbd_wq, &img_req->work);
+}
- rbd_img_request_put(img_request);
+static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
+{
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+ if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
+ obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
+ return true;
+ }
+
+ dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
+ obj_req->ex.oe_objno);
+ return false;
+}
+
+static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
+{
+ struct ceph_osd_request *osd_req;
+ int ret;
+
+ osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
+ if (IS_ERR(osd_req))
+ return PTR_ERR(osd_req);
+
+ osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
+ obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
+ rbd_osd_setup_data(osd_req, 0);
+ rbd_osd_format_read(osd_req);
+
+ ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
+ if (ret)
+ return ret;
+
+ rbd_osd_submit(osd_req);
+ return 0;
}
static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
@@ -2396,51 +2973,144 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
return ret;
}
- rbd_img_request_submit(child_img_req);
+ /* avoid parent chain recursion */
+ rbd_img_schedule(child_img_req, 0);
return 0;
}
-static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
+static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
{
struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
int ret;
- if (obj_req->result == -ENOENT &&
- rbd_dev->parent_overlap && !obj_req->tried_parent) {
- /* reverse map this object extent onto the parent */
- ret = rbd_obj_calc_img_extents(obj_req, false);
+again:
+ switch (obj_req->read_state) {
+ case RBD_OBJ_READ_START:
+ rbd_assert(!*result);
+
+ if (!rbd_obj_may_exist(obj_req)) {
+ *result = -ENOENT;
+ obj_req->read_state = RBD_OBJ_READ_OBJECT;
+ goto again;
+ }
+
+ ret = rbd_obj_read_object(obj_req);
if (ret) {
- obj_req->result = ret;
+ *result = ret;
return true;
}
-
- if (obj_req->num_img_extents) {
- obj_req->tried_parent = true;
- ret = rbd_obj_read_from_parent(obj_req);
+ obj_req->read_state = RBD_OBJ_READ_OBJECT;
+ return false;
+ case RBD_OBJ_READ_OBJECT:
+ if (*result == -ENOENT && rbd_dev->parent_overlap) {
+ /* reverse map this object extent onto the parent */
+ ret = rbd_obj_calc_img_extents(obj_req, false);
if (ret) {
- obj_req->result = ret;
+ *result = ret;
return true;
}
- return false;
+ if (obj_req->num_img_extents) {
+ ret = rbd_obj_read_from_parent(obj_req);
+ if (ret) {
+ *result = ret;
+ return true;
+ }
+ obj_req->read_state = RBD_OBJ_READ_PARENT;
+ return false;
+ }
+ }
+
+ /*
+ * -ENOENT means a hole in the image -- zero-fill the entire
+ * length of the request. A short read also implies zero-fill
+ * to the end of the request.
+ */
+ if (*result == -ENOENT) {
+ rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
+ *result = 0;
+ } else if (*result >= 0) {
+ if (*result < obj_req->ex.oe_len)
+ rbd_obj_zero_range(obj_req, *result,
+ obj_req->ex.oe_len - *result);
+ else
+ rbd_assert(*result == obj_req->ex.oe_len);
+ *result = 0;
}
+ return true;
+ case RBD_OBJ_READ_PARENT:
+ return true;
+ default:
+ BUG();
}
+}
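
The resulting read state machine, sketched:

    /*
     * RBD_OBJ_READ_START
     *   object map says nonexistent -> fake -ENOENT, fall into READ_OBJECT
     *   otherwise submit the OSD read -> RBD_OBJ_READ_OBJECT
     * RBD_OBJ_READ_OBJECT
     *   -ENOENT + parent overlap -> read from parent -> RBD_OBJ_READ_PARENT
     *   -ENOENT, no parent data  -> zero-fill the whole extent, done
     *   short read               -> zero-fill the tail, done
     * RBD_OBJ_READ_PARENT        -> done
     */
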
- /*
- * -ENOENT means a hole in the image -- zero-fill the entire
- * length of the request. A short read also implies zero-fill
- * to the end of the request. In both cases we update xferred
- * count to indicate the whole request was satisfied.
- */
- if (obj_req->result == -ENOENT ||
- (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
- rbd_assert(!obj_req->xferred || !obj_req->result);
- rbd_obj_zero_range(obj_req, obj_req->xferred,
- obj_req->ex.oe_len - obj_req->xferred);
- obj_req->result = 0;
- obj_req->xferred = obj_req->ex.oe_len;
+static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
+{
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+ if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
+ obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
+
+ if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
+ (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
+ dout("%s %p noop for nonexistent\n", __func__, obj_req);
+ return true;
}
- return true;
+ return false;
+}
+
+/*
+ * Return:
+ * 0 - object map update sent
+ * 1 - object map update isn't needed
+ * <0 - error
+ */
+static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
+{
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+ u8 new_state;
+
+ if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+ return 1;
+
+ if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
+ new_state = OBJECT_PENDING;
+ else
+ new_state = OBJECT_EXISTS;
+
+ return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
+}
+
+static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
+{
+ struct ceph_osd_request *osd_req;
+ int num_ops = count_write_ops(obj_req);
+ int which = 0;
+ int ret;
+
+ if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
+ num_ops++; /* stat */
+
+ osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
+ if (IS_ERR(osd_req))
+ return PTR_ERR(osd_req);
+
+ if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
+ ret = rbd_osd_setup_stat(osd_req, which++);
+ if (ret)
+ return ret;
+ }
+
+ rbd_osd_setup_write_ops(osd_req, which);
+ rbd_osd_format_write(osd_req);
+
+ ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
+ if (ret)
+ return ret;
+
+ rbd_osd_submit(osd_req);
+ return 0;
}
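
Putting count_write_ops() and the copyup guard together, a guarded write to a clone ends up with an OSD request shaped like this (illustrative):

    /*
     * op 0: stat          only with RBD_OBJ_FLAG_COPYUP_ENABLED; fails
     *                     with -ENOENT if the object doesn't exist and
     *                     diverts the state machine into copyup
     * op 1: setallochint  skipped when the object map says the object
     *                     may already exist
     * op 2: write         writefull when the entire object is overwritten
     */
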
/*
@@ -2463,123 +3133,67 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
#define MODS_ONLY U32_MAX
-static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req,
- u32 bytes)
+static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
+ u32 bytes)
{
+ struct ceph_osd_request *osd_req;
int ret;
dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
- rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
rbd_assert(bytes > 0 && bytes != MODS_ONLY);
- rbd_osd_req_destroy(obj_req->osd_req);
- obj_req->osd_req = __rbd_osd_req_create(obj_req, &rbd_empty_snapc, 1);
- if (!obj_req->osd_req)
- return -ENOMEM;
+ osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
+ if (IS_ERR(osd_req))
+ return PTR_ERR(osd_req);
- ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup");
+ ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
if (ret)
return ret;
- osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
- obj_req->copyup_bvecs,
- obj_req->copyup_bvec_count,
- bytes);
- rbd_osd_req_format_write(obj_req);
+ rbd_osd_format_write(osd_req);
- ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
+ ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
if (ret)
return ret;
- rbd_obj_request_submit(obj_req);
+ rbd_osd_submit(osd_req);
return 0;
}
-static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
+static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
+ u32 bytes)
{
- struct rbd_img_request *img_req = obj_req->img_request;
- unsigned int num_osd_ops = (bytes != MODS_ONLY);
- unsigned int which = 0;
+ struct ceph_osd_request *osd_req;
+ int num_ops = count_write_ops(obj_req);
+ int which = 0;
int ret;
dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
- rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT ||
- obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL);
- rbd_osd_req_destroy(obj_req->osd_req);
- switch (img_req->op_type) {
- case OBJ_OP_WRITE:
- num_osd_ops += count_write_ops(obj_req);
- break;
- case OBJ_OP_ZEROOUT:
- num_osd_ops += count_zeroout_ops(obj_req);
- break;
- default:
- BUG();
- }
+ if (bytes != MODS_ONLY)
+ num_ops++; /* copyup */
- obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
- if (!obj_req->osd_req)
- return -ENOMEM;
+ osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
+ if (IS_ERR(osd_req))
+ return PTR_ERR(osd_req);
if (bytes != MODS_ONLY) {
- ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd",
- "copyup");
+ ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
if (ret)
return ret;
-
- osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++,
- obj_req->copyup_bvecs,
- obj_req->copyup_bvec_count,
- bytes);
}
- switch (img_req->op_type) {
- case OBJ_OP_WRITE:
- __rbd_obj_setup_write(obj_req, which);
- break;
- case OBJ_OP_ZEROOUT:
- __rbd_obj_setup_zeroout(obj_req, which);
- break;
- default:
- BUG();
- }
+ rbd_osd_setup_write_ops(osd_req, which);
+ rbd_osd_format_write(osd_req);
- ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
+ ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
if (ret)
return ret;
- rbd_obj_request_submit(obj_req);
+ rbd_osd_submit(osd_req);
return 0;
}
-static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
-{
- /*
- * Only send non-zero copyup data to save some I/O and network
- * bandwidth -- zero copyup data is equivalent to the object not
- * existing.
- */
- if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
- dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
- bytes = 0;
- }
-
- if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
- /*
- * Send a copyup request with an empty snapshot context to
- * deep-copyup the object through all existing snapshots.
- * A second request with the current snapshot context will be
- * sent for the actual modification.
- */
- obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC;
- return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes);
- }
-
- obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
- return rbd_obj_issue_copyup_ops(obj_req, bytes);
-}
-
static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
u32 i;
@@ -2608,7 +3222,12 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
return 0;
}
-static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
+/*
+ * The target object doesn't exist. Read the data for the entire
+ * target object up to the overlap point (if any) from the parent,
+ * so we can use it for a copyup.
+ */
+static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
{
struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
int ret;
@@ -2623,178 +3242,492 @@ static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
* request -- pass MODS_ONLY since the copyup isn't needed
* anymore.
*/
- obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
- return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
+ return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
}
ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
if (ret)
return ret;
- obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT;
return rbd_obj_read_from_parent(obj_req);
}
-static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
+static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
{
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+ struct ceph_snap_context *snapc = obj_req->img_request->snapc;
+ u8 new_state;
+ u32 i;
int ret;
- switch (obj_req->write_state) {
- case RBD_OBJ_WRITE_GUARD:
- rbd_assert(!obj_req->xferred);
- if (obj_req->result == -ENOENT) {
- /*
- * The target object doesn't exist. Read the data for
- * the entire target object up to the overlap point (if
- * any) from the parent, so we can use it for a copyup.
- */
- ret = rbd_obj_handle_write_guard(obj_req);
- if (ret) {
- obj_req->result = ret;
- return true;
- }
- return false;
+ rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
+
+ if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+ return;
+
+ if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
+ return;
+
+ for (i = 0; i < snapc->num_snaps; i++) {
+ if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
+ i + 1 < snapc->num_snaps)
+ new_state = OBJECT_EXISTS_CLEAN;
+ else
+ new_state = OBJECT_EXISTS;
+
+ ret = rbd_object_map_update(obj_req, snapc->snaps[i],
+ new_state, NULL);
+ if (ret < 0) {
+ obj_req->pending.result = ret;
+ return;
}
- /* fall through */
- case RBD_OBJ_WRITE_FLAT:
- case RBD_OBJ_WRITE_COPYUP_OPS:
- if (!obj_req->result)
- /*
- * There is no such thing as a successful short
- * write -- indicate the whole request was satisfied.
- */
- obj_req->xferred = obj_req->ex.oe_len;
- return true;
- case RBD_OBJ_WRITE_READ_FROM_PARENT:
- if (obj_req->result)
- return true;
- rbd_assert(obj_req->xferred);
- ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
+ rbd_assert(!ret);
+ obj_req->pending.num_pending++;
+ }
+}
+
+static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
+{
+ u32 bytes = rbd_obj_img_extents_bytes(obj_req);
+ int ret;
+
+ rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
+
+ /*
+ * Only send non-zero copyup data to save some I/O and network
+ * bandwidth -- zero copyup data is equivalent to the object not
+ * existing.
+ */
+ if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
+ bytes = 0;
+
+ if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
+ /*
+ * Send a copyup request with an empty snapshot context to
+ * deep-copyup the object through all existing snapshots.
+ * A second request with the current snapshot context will be
+ * sent for the actual modification.
+ */
+ ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
+ if (ret) {
+ obj_req->pending.result = ret;
+ return;
+ }
+
+ obj_req->pending.num_pending++;
+ bytes = MODS_ONLY;
+ }
+
+ ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
+ if (ret) {
+ obj_req->pending.result = ret;
+ return;
+ }
+
+ obj_req->pending.num_pending++;
+}
+
+static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
+{
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+ int ret;
+
+again:
+ switch (obj_req->copyup_state) {
+ case RBD_OBJ_COPYUP_START:
+ rbd_assert(!*result);
+
+ ret = rbd_obj_copyup_read_parent(obj_req);
if (ret) {
- obj_req->result = ret;
- obj_req->xferred = 0;
+ *result = ret;
return true;
}
+ if (obj_req->num_img_extents)
+ obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
+ else
+ obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
return false;
- case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC:
- if (obj_req->result)
+ case RBD_OBJ_COPYUP_READ_PARENT:
+ if (*result)
return true;
- obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
- ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
- if (ret) {
- obj_req->result = ret;
+ if (is_zero_bvecs(obj_req->copyup_bvecs,
+ rbd_obj_img_extents_bytes(obj_req))) {
+ dout("%s %p detected zeros\n", __func__, obj_req);
+ obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
+ }
+
+ rbd_obj_copyup_object_maps(obj_req);
+ if (!obj_req->pending.num_pending) {
+ *result = obj_req->pending.result;
+ obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
+ goto again;
+ }
+ obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
+ return false;
+ case __RBD_OBJ_COPYUP_OBJECT_MAPS:
+ if (!pending_result_dec(&obj_req->pending, result))
+ return false;
+ /* fall through */
+ case RBD_OBJ_COPYUP_OBJECT_MAPS:
+ if (*result) {
+ rbd_warn(rbd_dev, "snap object map update failed: %d",
+ *result);
return true;
}
+
+ rbd_obj_copyup_write_object(obj_req);
+ if (!obj_req->pending.num_pending) {
+ *result = obj_req->pending.result;
+ obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
+ goto again;
+ }
+ obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
return false;
+ case __RBD_OBJ_COPYUP_WRITE_OBJECT:
+ if (!pending_result_dec(&obj_req->pending, result))
+ return false;
+ /* fall through */
+ case RBD_OBJ_COPYUP_WRITE_OBJECT:
+ return true;
default:
BUG();
}
}
/*
- * Returns true if @obj_req is completed, or false otherwise.
+ * Return:
+ * 0 - object map update sent
+ * 1 - object map update isn't needed
+ * <0 - error
*/
-static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
+static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
{
- switch (obj_req->img_request->op_type) {
- case OBJ_OP_READ:
- return rbd_obj_handle_read(obj_req);
- case OBJ_OP_WRITE:
- return rbd_obj_handle_write(obj_req);
- case OBJ_OP_DISCARD:
- case OBJ_OP_ZEROOUT:
- if (rbd_obj_handle_write(obj_req)) {
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+ u8 current_state = OBJECT_PENDING;
+
+ if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+ return 1;
+
+ if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
+ return 1;
+
+ return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
+ &current_state);
+}
+
+static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
+{
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+ int ret;
+
+again:
+ switch (obj_req->write_state) {
+ case RBD_OBJ_WRITE_START:
+ rbd_assert(!*result);
+
+ if (rbd_obj_write_is_noop(obj_req))
+ return true;
+
+ ret = rbd_obj_write_pre_object_map(obj_req);
+ if (ret < 0) {
+ *result = ret;
+ return true;
+ }
+ obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
+ if (ret > 0)
+ goto again;
+ return false;
+ case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
+ if (*result) {
+ rbd_warn(rbd_dev, "pre object map update failed: %d",
+ *result);
+ return true;
+ }
+ ret = rbd_obj_write_object(obj_req);
+ if (ret) {
+ *result = ret;
+ return true;
+ }
+ obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
+ return false;
+ case RBD_OBJ_WRITE_OBJECT:
+ if (*result == -ENOENT) {
+ if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
+ *result = 0;
+ obj_req->copyup_state = RBD_OBJ_COPYUP_START;
+ obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
+ goto again;
+ }
/*
- * Hide -ENOENT from delete/truncate/zero -- discarding
- * a non-existent object is not a problem.
+ * On a non-existent object, delete returns -ENOENT
+ * while truncate/zero return 0.
*/
- if (obj_req->result == -ENOENT) {
- obj_req->result = 0;
- obj_req->xferred = obj_req->ex.oe_len;
- }
+ if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
+ *result = 0;
+ }
+ if (*result)
+ return true;
+
+ obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
+ goto again;
+ case __RBD_OBJ_WRITE_COPYUP:
+ if (!rbd_obj_advance_copyup(obj_req, result))
+ return false;
+ /* fall through */
+ case RBD_OBJ_WRITE_COPYUP:
+ if (*result) {
+ rbd_warn(rbd_dev, "copyup failed: %d", *result);
+ return true;
+ }
+ ret = rbd_obj_write_post_object_map(obj_req);
+ if (ret < 0) {
+ *result = ret;
return true;
}
+ obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
+ if (ret > 0)
+ goto again;
return false;
+ case RBD_OBJ_WRITE_POST_OBJECT_MAP:
+ if (*result)
+ rbd_warn(rbd_dev, "post object map update failed: %d",
+ *result);
+ return true;
default:
BUG();
}
}
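
The write-side state machine, sketched end to end:

    /*
     * RBD_OBJ_WRITE_START
     *   no-op for a nonexistent object -> done
     *   pre object map update (EXISTS, or PENDING for deletions)
     * RBD_OBJ_WRITE_PRE_OBJECT_MAP -> submit the OSD write(s)
     * RBD_OBJ_WRITE_OBJECT
     *   -ENOENT + copyup enabled -> __RBD_OBJ_WRITE_COPYUP (run the
     *                               copyup state machine above)
     *   -ENOENT on truncate/zero -> treated as success
     * RBD_OBJ_WRITE_COPYUP
     *   post object map update (PENDING -> NONEXISTENT for deletions)
     * RBD_OBJ_WRITE_POST_OBJECT_MAP -> done
     */
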
-static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
+/*
+ * Return true if @obj_req is completed.
+ */
+static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
+ int *result)
{
struct rbd_img_request *img_req = obj_req->img_request;
+ struct rbd_device *rbd_dev = img_req->rbd_dev;
+ bool done;
- rbd_assert((!obj_req->result &&
- obj_req->xferred == obj_req->ex.oe_len) ||
- (obj_req->result < 0 && !obj_req->xferred));
- if (!obj_req->result) {
- img_req->xferred += obj_req->xferred;
- return;
- }
+ mutex_lock(&obj_req->state_mutex);
+ if (!rbd_img_is_write(img_req))
+ done = rbd_obj_advance_read(obj_req, result);
+ else
+ done = rbd_obj_advance_write(obj_req, result);
+ mutex_unlock(&obj_req->state_mutex);
- rbd_warn(img_req->rbd_dev,
- "%s at objno %llu %llu~%llu result %d xferred %llu",
- obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
- obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
- obj_req->xferred);
- if (!img_req->result) {
- img_req->result = obj_req->result;
- img_req->xferred = 0;
+ if (done && *result) {
+ rbd_assert(*result < 0);
+ rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
+ obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
+ obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
}
+ return done;
}
-static void rbd_img_end_child_request(struct rbd_img_request *img_req)
+/*
+ * This is open-coded in rbd_img_handle_request() to avoid parent chain
+ * recursion.
+ */
+static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
{
- struct rbd_obj_request *obj_req = img_req->obj_request;
+ if (__rbd_obj_handle_request(obj_req, &result))
+ rbd_img_handle_request(obj_req->img_request, result);
+}
- rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
- rbd_assert((!img_req->result &&
- img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
- (img_req->result < 0 && !img_req->xferred));
+static bool need_exclusive_lock(struct rbd_img_request *img_req)
+{
+ struct rbd_device *rbd_dev = img_req->rbd_dev;
- obj_req->result = img_req->result;
- obj_req->xferred = img_req->xferred;
- rbd_img_request_put(img_req);
+ if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
+ return false;
+
+ if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
+ return false;
+
+ rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
+ if (rbd_dev->opts->lock_on_read ||
+ (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+ return true;
+
+ return rbd_img_is_write(img_req);
}
-static void rbd_img_end_request(struct rbd_img_request *img_req)
+static bool rbd_lock_add_request(struct rbd_img_request *img_req)
{
- rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
- rbd_assert((!img_req->result &&
- img_req->xferred == blk_rq_bytes(img_req->rq)) ||
- (img_req->result < 0 && !img_req->xferred));
+ struct rbd_device *rbd_dev = img_req->rbd_dev;
+ bool locked;
+
+ lockdep_assert_held(&rbd_dev->lock_rwsem);
+ locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
+ spin_lock(&rbd_dev->lock_lists_lock);
+ rbd_assert(list_empty(&img_req->lock_item));
+ if (!locked)
+ list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
+ else
+ list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
+ spin_unlock(&rbd_dev->lock_lists_lock);
+ return locked;
+}
+
+static void rbd_lock_del_request(struct rbd_img_request *img_req)
+{
+ struct rbd_device *rbd_dev = img_req->rbd_dev;
+ bool need_wakeup;
- blk_mq_end_request(img_req->rq,
- errno_to_blk_status(img_req->result));
- rbd_img_request_put(img_req);
+ lockdep_assert_held(&rbd_dev->lock_rwsem);
+ spin_lock(&rbd_dev->lock_lists_lock);
+ rbd_assert(!list_empty(&img_req->lock_item));
+ list_del_init(&img_req->lock_item);
+ need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
+ list_empty(&rbd_dev->running_list));
+ spin_unlock(&rbd_dev->lock_lists_lock);
+ if (need_wakeup)
+ complete(&rbd_dev->releasing_wait);
}
-static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
+static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
{
- struct rbd_img_request *img_req;
+ struct rbd_device *rbd_dev = img_req->rbd_dev;
+
+ if (!need_exclusive_lock(img_req))
+ return 1;
+
+ if (rbd_lock_add_request(img_req))
+ return 1;
+
+ if (rbd_dev->opts->exclusive) {
+ WARN_ON(1); /* lock got released? */
+ return -EROFS;
+ }
+
+ /*
+ * Note the use of mod_delayed_work() in rbd_acquire_lock()
+ * and cancel_delayed_work() in wake_lock_waiters().
+ */
+ dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
+ queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
+ return 0;
+}
+
+static void rbd_img_object_requests(struct rbd_img_request *img_req)
+{
+ struct rbd_obj_request *obj_req;
+
+ rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
+
+ for_each_obj_request(img_req, obj_req) {
+ int result = 0;
+
+ if (__rbd_obj_handle_request(obj_req, &result)) {
+ if (result) {
+ img_req->pending.result = result;
+ return;
+ }
+ } else {
+ img_req->pending.num_pending++;
+ }
+ }
+}
+
+static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
+{
+ struct rbd_device *rbd_dev = img_req->rbd_dev;
+ int ret;
again:
- if (!__rbd_obj_handle_request(obj_req))
- return;
+ switch (img_req->state) {
+ case RBD_IMG_START:
+ rbd_assert(!*result);
- img_req = obj_req->img_request;
- spin_lock(&img_req->completion_lock);
- rbd_obj_end_request(obj_req);
- rbd_assert(img_req->pending_count);
- if (--img_req->pending_count) {
- spin_unlock(&img_req->completion_lock);
- return;
+ ret = rbd_img_exclusive_lock(img_req);
+ if (ret < 0) {
+ *result = ret;
+ return true;
+ }
+ img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
+ if (ret > 0)
+ goto again;
+ return false;
+ case RBD_IMG_EXCLUSIVE_LOCK:
+ if (*result)
+ return true;
+
+ rbd_assert(!need_exclusive_lock(img_req) ||
+ __rbd_is_lock_owner(rbd_dev));
+
+ rbd_img_object_requests(img_req);
+ if (!img_req->pending.num_pending) {
+ *result = img_req->pending.result;
+ img_req->state = RBD_IMG_OBJECT_REQUESTS;
+ goto again;
+ }
+ img_req->state = __RBD_IMG_OBJECT_REQUESTS;
+ return false;
+ case __RBD_IMG_OBJECT_REQUESTS:
+ if (!pending_result_dec(&img_req->pending, result))
+ return false;
+ /* fall through */
+ case RBD_IMG_OBJECT_REQUESTS:
+ return true;
+ default:
+ BUG();
+ }
+}
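
And the per-image state machine that drives the object requests:

    /*
     * RBD_IMG_START
     *   acquire (or queue for) the exclusive lock if needed
     * RBD_IMG_EXCLUSIVE_LOCK
     *   kick every object request state machine; if any remain in
     *   flight -> __RBD_IMG_OBJECT_REQUESTS
     * __RBD_IMG_OBJECT_REQUESTS
     *   pending_result_dec() once per object request completion
     * RBD_IMG_OBJECT_REQUESTS
     *   done: blk_mq_end_request() or completion of the parent read
     */
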
+
+/*
+ * Return true if @img_req is completed.
+ */
+static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
+ int *result)
+{
+ struct rbd_device *rbd_dev = img_req->rbd_dev;
+ bool done;
+
+ if (need_exclusive_lock(img_req)) {
+ down_read(&rbd_dev->lock_rwsem);
+ mutex_lock(&img_req->state_mutex);
+ done = rbd_img_advance(img_req, result);
+ if (done)
+ rbd_lock_del_request(img_req);
+ mutex_unlock(&img_req->state_mutex);
+ up_read(&rbd_dev->lock_rwsem);
+ } else {
+ mutex_lock(&img_req->state_mutex);
+ done = rbd_img_advance(img_req, result);
+ mutex_unlock(&img_req->state_mutex);
+ }
+
+ if (done && *result) {
+ rbd_assert(*result < 0);
+ rbd_warn(rbd_dev, "%s%s result %d",
+ test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
+ obj_op_name(img_req->op_type), *result);
}
+ return done;
+}
+
+static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
+{
+again:
+ if (!__rbd_img_handle_request(img_req, &result))
+ return;
- spin_unlock(&img_req->completion_lock);
if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
- obj_req = img_req->obj_request;
- rbd_img_end_child_request(img_req);
- goto again;
+ struct rbd_obj_request *obj_req = img_req->obj_request;
+
+ rbd_img_request_put(img_req);
+ if (__rbd_obj_handle_request(obj_req, &result)) {
+ img_req = obj_req->img_request;
+ goto again;
+ }
+ } else {
+ struct request *rq = img_req->rq;
+
+ rbd_img_request_put(img_req);
+ blk_mq_end_request(rq, errno_to_blk_status(result));
}
- rbd_img_end_request(img_req);
}
static const struct rbd_client_id rbd_empty_cid;
@@ -2839,6 +3772,7 @@ static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
struct rbd_client_id cid = rbd_get_cid(rbd_dev);
+ rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
strcpy(rbd_dev->lock_cookie, cookie);
rbd_set_owner_cid(rbd_dev, &cid);
queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
@@ -2863,7 +3797,6 @@ static int rbd_lock(struct rbd_device *rbd_dev)
if (ret)
return ret;
- rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
__rbd_lock(rbd_dev, cookie);
return 0;
}
@@ -2882,7 +3815,7 @@ static void rbd_unlock(struct rbd_device *rbd_dev)
ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
RBD_LOCK_NAME, rbd_dev->lock_cookie);
if (ret && ret != -ENOENT)
- rbd_warn(rbd_dev, "failed to unlock: %d", ret);
+ rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
/* treat errors as the image is unlocked */
rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
@@ -3009,15 +3942,34 @@ e_inval:
goto out;
}
-static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
+/*
+ * Waiters are either image request state machine(s) or
+ * rbd_add_acquire_lock() (i.e. "rbd map").
+ */
+static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
{
- dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
+ struct rbd_img_request *img_req;
+
+ dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
+ lockdep_assert_held_write(&rbd_dev->lock_rwsem);
cancel_delayed_work(&rbd_dev->lock_dwork);
- if (wake_all)
- wake_up_all(&rbd_dev->lock_waitq);
- else
- wake_up(&rbd_dev->lock_waitq);
+ if (!completion_done(&rbd_dev->acquire_wait)) {
+ rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
+ list_empty(&rbd_dev->running_list));
+ rbd_dev->acquire_err = result;
+ complete_all(&rbd_dev->acquire_wait);
+ return;
+ }
+
+ list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
+ mutex_lock(&img_req->state_mutex);
+ rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
+ rbd_img_schedule(img_req, result);
+ mutex_unlock(&img_req->state_mutex);
+ }
+
+ list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
}
static int get_lock_owner_info(struct rbd_device *rbd_dev,
@@ -3132,13 +4084,10 @@ static int rbd_try_lock(struct rbd_device *rbd_dev)
goto again;
ret = find_watcher(rbd_dev, lockers);
- if (ret) {
- if (ret > 0)
- ret = 0; /* have to request lock */
- goto out;
- }
+ if (ret)
+ goto out; /* request lock or error */
- rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
+ rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
ENTITY_NAME(lockers[0].id.name));
ret = ceph_monc_blacklist_add(&client->monc,
@@ -3165,53 +4114,90 @@ out:
return ret;
}
+static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
+{
+ int ret;
+
+ if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
+ ret = rbd_object_map_open(rbd_dev);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
/*
- * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
+ * Return:
+ * 0 - lock acquired
+ * 1 - caller should call rbd_request_lock()
+ * <0 - error
*/
-static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
- int *pret)
+static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
{
- enum rbd_lock_state lock_state;
+ int ret;
down_read(&rbd_dev->lock_rwsem);
dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
rbd_dev->lock_state);
if (__rbd_is_lock_owner(rbd_dev)) {
- lock_state = rbd_dev->lock_state;
up_read(&rbd_dev->lock_rwsem);
- return lock_state;
+ return 0;
}
up_read(&rbd_dev->lock_rwsem);
down_write(&rbd_dev->lock_rwsem);
dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
rbd_dev->lock_state);
- if (!__rbd_is_lock_owner(rbd_dev)) {
- *pret = rbd_try_lock(rbd_dev);
- if (*pret)
- rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
+ if (__rbd_is_lock_owner(rbd_dev)) {
+ up_write(&rbd_dev->lock_rwsem);
+ return 0;
+ }
+
+ ret = rbd_try_lock(rbd_dev);
+ if (ret < 0) {
+ rbd_warn(rbd_dev, "failed to lock header: %d", ret);
+ if (ret == -EBLACKLISTED)
+ goto out;
+
+ ret = 1; /* request lock anyway */
+ }
+ if (ret > 0) {
+ up_write(&rbd_dev->lock_rwsem);
+ return ret;
+ }
+
+ rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
+ rbd_assert(list_empty(&rbd_dev->running_list));
+
+ ret = rbd_post_acquire_action(rbd_dev);
+ if (ret) {
+ rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
+ /*
+ * Can't stay in RBD_LOCK_STATE_LOCKED because
+ * rbd_lock_add_request() would let the request through,
+ * assuming that e.g. object map is locked and loaded.
+ */
+ rbd_unlock(rbd_dev);
}
- lock_state = rbd_dev->lock_state;
+out:
+ wake_lock_waiters(rbd_dev, ret);
up_write(&rbd_dev->lock_rwsem);
- return lock_state;
+ return ret;
}
static void rbd_acquire_lock(struct work_struct *work)
{
struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
struct rbd_device, lock_dwork);
- enum rbd_lock_state lock_state;
- int ret = 0;
+ int ret;
dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
- lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
- if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
- if (lock_state == RBD_LOCK_STATE_LOCKED)
- wake_requests(rbd_dev, true);
- dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
- rbd_dev, lock_state, ret);
+ ret = rbd_try_acquire_lock(rbd_dev);
+ if (ret <= 0) {
+ dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
return;
}
@@ -3220,16 +4206,9 @@ again:
goto again; /* treat this as a dead client */
} else if (ret == -EROFS) {
rbd_warn(rbd_dev, "peer will not release lock");
- /*
- * If this is rbd_add_acquire_lock(), we want to fail
- * immediately -- reuse BLACKLISTED flag. Otherwise we
- * want to block.
- */
- if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
- set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
- /* wake "rbd map --exclusive" process */
- wake_requests(rbd_dev, false);
- }
+ down_write(&rbd_dev->lock_rwsem);
+ wake_lock_waiters(rbd_dev, ret);
+ up_write(&rbd_dev->lock_rwsem);
} else if (ret < 0) {
rbd_warn(rbd_dev, "error requesting lock: %d", ret);
mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
@@ -3246,43 +4225,67 @@ again:
}
}
-/*
- * lock_rwsem must be held for write
- */
-static bool rbd_release_lock(struct rbd_device *rbd_dev)
+static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
{
- dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
- rbd_dev->lock_state);
+ bool need_wait;
+
+ dout("%s rbd_dev %p\n", __func__, rbd_dev);
+ lockdep_assert_held_write(&rbd_dev->lock_rwsem);
+
if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
return false;
- rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
- downgrade_write(&rbd_dev->lock_rwsem);
/*
* Ensure that all in-flight IO is flushed.
- *
- * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
- * may be shared with other devices.
*/
- ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
+ rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
+ rbd_assert(!completion_done(&rbd_dev->releasing_wait));
+ need_wait = !list_empty(&rbd_dev->running_list);
+ downgrade_write(&rbd_dev->lock_rwsem);
+ if (need_wait)
+ wait_for_completion(&rbd_dev->releasing_wait);
up_read(&rbd_dev->lock_rwsem);
down_write(&rbd_dev->lock_rwsem);
- dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
- rbd_dev->lock_state);
if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
return false;
+ rbd_assert(list_empty(&rbd_dev->running_list));
+ return true;
+}
+
+static void rbd_pre_release_action(struct rbd_device *rbd_dev)
+{
+ if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
+ rbd_object_map_close(rbd_dev);
+}
+
+static void __rbd_release_lock(struct rbd_device *rbd_dev)
+{
+ rbd_assert(list_empty(&rbd_dev->running_list));
+
+ rbd_pre_release_action(rbd_dev);
rbd_unlock(rbd_dev);
+}
+
+/*
+ * lock_rwsem must be held for write
+ */
+static void rbd_release_lock(struct rbd_device *rbd_dev)
+{
+ if (!rbd_quiesce_lock(rbd_dev))
+ return;
+
+ __rbd_release_lock(rbd_dev);
+
/*
* Give others a chance to grab the lock - we would re-acquire
- * almost immediately if we got new IO during ceph_osdc_sync()
- * otherwise. We need to ack our own notifications, so this
- * lock_dwork will be requeued from rbd_wait_state_locked()
- * after wake_requests() in rbd_handle_released_lock().
+ * almost immediately if we got new IO while draining the running
+ * list otherwise. We need to ack our own notifications, so this
+ * lock_dwork will be requeued from rbd_handle_released_lock() by
+ * way of maybe_kick_acquire().
*/
cancel_delayed_work(&rbd_dev->lock_dwork);
- return true;
}
static void rbd_release_lock_work(struct work_struct *work)
@@ -3295,6 +4298,23 @@ static void rbd_release_lock_work(struct work_struct *work)
up_write(&rbd_dev->lock_rwsem);
}
+static void maybe_kick_acquire(struct rbd_device *rbd_dev)
+{
+ bool have_requests;
+
+ dout("%s rbd_dev %p\n", __func__, rbd_dev);
+ if (__rbd_is_lock_owner(rbd_dev))
+ return;
+
+ spin_lock(&rbd_dev->lock_lists_lock);
+ have_requests = !list_empty(&rbd_dev->acquiring_list);
+ spin_unlock(&rbd_dev->lock_lists_lock);
+ if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
+ dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
+ mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
+ }
+}
+
static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
void **p)
{
@@ -3324,8 +4344,7 @@ static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
down_read(&rbd_dev->lock_rwsem);
}
- if (!__rbd_is_lock_owner(rbd_dev))
- wake_requests(rbd_dev, false);
+ maybe_kick_acquire(rbd_dev);
up_read(&rbd_dev->lock_rwsem);
}
@@ -3357,8 +4376,7 @@ static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
down_read(&rbd_dev->lock_rwsem);
}
- if (!__rbd_is_lock_owner(rbd_dev))
- wake_requests(rbd_dev, false);
+ maybe_kick_acquire(rbd_dev);
up_read(&rbd_dev->lock_rwsem);
}
@@ -3608,7 +4626,6 @@ static void cancel_tasks_sync(struct rbd_device *rbd_dev)
static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
- WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
cancel_tasks_sync(rbd_dev);
mutex_lock(&rbd_dev->watch_mutex);
@@ -3630,7 +4647,8 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
char cookie[32];
int ret;
- WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
+ if (!rbd_quiesce_lock(rbd_dev))
+ return;
format_lock_cookie(rbd_dev, cookie);
ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
@@ -3646,11 +4664,11 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
* Lock cookie cannot be updated on older OSDs, so do
* a manual release and queue an acquire.
*/
- if (rbd_release_lock(rbd_dev))
- queue_delayed_work(rbd_dev->task_wq,
- &rbd_dev->lock_dwork, 0);
+ __rbd_release_lock(rbd_dev);
+ queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
} else {
__rbd_lock(rbd_dev, cookie);
+ wake_lock_waiters(rbd_dev, 0);
}
}
@@ -3671,15 +4689,18 @@ static void rbd_reregister_watch(struct work_struct *work)
ret = __rbd_register_watch(rbd_dev);
if (ret) {
rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
- if (ret == -EBLACKLISTED || ret == -ENOENT) {
- set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
- wake_requests(rbd_dev, true);
- } else {
+ if (ret != -EBLACKLISTED && ret != -ENOENT) {
queue_delayed_work(rbd_dev->task_wq,
&rbd_dev->watch_dwork,
RBD_RETRY_DELAY);
+ mutex_unlock(&rbd_dev->watch_mutex);
+ return;
}
+
mutex_unlock(&rbd_dev->watch_mutex);
+ down_write(&rbd_dev->lock_rwsem);
+ wake_lock_waiters(rbd_dev, ret);
+ up_write(&rbd_dev->lock_rwsem);
return;
}
@@ -3742,7 +4763,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
CEPH_OSD_FLAG_READ, req_page, outbound_size,
- reply_page, &inbound_size);
+ &reply_page, &inbound_size);
if (!ret) {
memcpy(inbound, page_address(reply_page), inbound_size);
ret = inbound_size;
@@ -3754,54 +4775,6 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
return ret;
}
-/*
- * lock_rwsem must be held for read
- */
-static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
-{
- DEFINE_WAIT(wait);
- unsigned long timeout;
- int ret = 0;
-
- if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
- return -EBLACKLISTED;
-
- if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
- return 0;
-
- if (!may_acquire) {
- rbd_warn(rbd_dev, "exclusive lock required");
- return -EROFS;
- }
-
- do {
- /*
- * Note the use of mod_delayed_work() in rbd_acquire_lock()
- * and cancel_delayed_work() in wake_requests().
- */
- dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
- queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
- prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- up_read(&rbd_dev->lock_rwsem);
- timeout = schedule_timeout(ceph_timeout_jiffies(
- rbd_dev->opts->lock_timeout));
- down_read(&rbd_dev->lock_rwsem);
- if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
- ret = -EBLACKLISTED;
- break;
- }
- if (!timeout) {
- rbd_warn(rbd_dev, "timed out waiting for lock");
- ret = -ETIMEDOUT;
- break;
- }
- } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
-
- finish_wait(&rbd_dev->lock_waitq, &wait);
- return ret;
-}
-
static void rbd_queue_workfn(struct work_struct *work)
{
struct request *rq = blk_mq_rq_from_pdu(work);
@@ -3812,7 +4785,6 @@ static void rbd_queue_workfn(struct work_struct *work)
u64 length = blk_rq_bytes(rq);
enum obj_operation_type op_type;
u64 mapping_size;
- bool must_be_locked;
int result;
switch (req_op(rq)) {
@@ -3886,21 +4858,10 @@ static void rbd_queue_workfn(struct work_struct *work)
goto err_rq;
}
- must_be_locked =
- (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
- (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
- if (must_be_locked) {
- down_read(&rbd_dev->lock_rwsem);
- result = rbd_wait_state_locked(rbd_dev,
- !rbd_dev->opts->exclusive);
- if (result)
- goto err_unlock;
- }
-
img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
if (!img_request) {
result = -ENOMEM;
- goto err_unlock;
+ goto err_rq;
}
img_request->rq = rq;
snapc = NULL; /* img_request consumes a ref */
@@ -3910,19 +4871,14 @@ static void rbd_queue_workfn(struct work_struct *work)
else
result = rbd_img_fill_from_bio(img_request, offset, length,
rq->bio);
- if (result || !img_request->pending_count)
+ if (result)
goto err_img_request;
- rbd_img_request_submit(img_request);
- if (must_be_locked)
- up_read(&rbd_dev->lock_rwsem);
+ rbd_img_handle_request(img_request, 0);
return;
err_img_request:
rbd_img_request_put(img_request);
-err_unlock:
- if (must_be_locked)
- up_read(&rbd_dev->lock_rwsem);
err_rq:
if (result)
rbd_warn(rbd_dev, "%s %llx at %llx result %d",
@@ -4589,7 +5545,13 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
- init_waitqueue_head(&rbd_dev->lock_waitq);
+ spin_lock_init(&rbd_dev->lock_lists_lock);
+ INIT_LIST_HEAD(&rbd_dev->acquiring_list);
+ INIT_LIST_HEAD(&rbd_dev->running_list);
+ init_completion(&rbd_dev->acquire_wait);
+ init_completion(&rbd_dev->releasing_wait);
+
+ spin_lock_init(&rbd_dev->object_map_lock);
rbd_dev->dev.bus = &rbd_bus_type;
rbd_dev->dev.type = &rbd_device_type;
@@ -4772,6 +5734,32 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
&rbd_dev->header.features);
}
+/*
+ * These are generic image flags, but since they are used only for
+ * the object map, store them in rbd_dev->object_map_flags.
+ *
+ * For the same reason, this function is called only on object map
+ * (re)load and not on header refresh.
+ */
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
+{
+ __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
+ __le64 flags;
+ int ret;
+
+ ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+ &rbd_dev->header_oloc, "get_flags",
+ &snapid, sizeof(snapid),
+ &flags, sizeof(flags));
+ if (ret < 0)
+ return ret;
+ if (ret < sizeof(flags))
+ return -EBADMSG;
+
+ rbd_dev->object_map_flags = le64_to_cpu(flags);
+ return 0;
+}
+
struct parent_image_info {
u64 pool_id;
const char *pool_ns;
@@ -4829,7 +5817,7 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
"rbd", "parent_get", CEPH_OSD_FLAG_READ,
- req_page, sizeof(u64), reply_page, &reply_len);
+ req_page, sizeof(u64), &reply_page, &reply_len);
if (ret)
return ret == -EOPNOTSUPP ? 1 : ret;
@@ -4841,7 +5829,7 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
"rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
- req_page, sizeof(u64), reply_page, &reply_len);
+ req_page, sizeof(u64), &reply_page, &reply_len);
if (ret)
return ret;
@@ -4872,7 +5860,7 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
"rbd", "get_parent", CEPH_OSD_FLAG_READ,
- req_page, sizeof(u64), reply_page, &reply_len);
+ req_page, sizeof(u64), &reply_page, &reply_len);
if (ret)
return ret;
@@ -5605,28 +6593,49 @@ static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
{
down_write(&rbd_dev->lock_rwsem);
if (__rbd_is_lock_owner(rbd_dev))
- rbd_unlock(rbd_dev);
+ __rbd_release_lock(rbd_dev);
up_write(&rbd_dev->lock_rwsem);
}
+/*
+ * If the wait is interrupted, an error is returned even if the lock
+ * was successfully acquired. rbd_dev_image_unlock() will release it
+ * if needed.
+ */
static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
{
- int ret;
+ long ret;
if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
+ if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
+ return 0;
+
rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
return -EINVAL;
}
- /* FIXME: "rbd map --exclusive" should be in interruptible */
- down_read(&rbd_dev->lock_rwsem);
- ret = rbd_wait_state_locked(rbd_dev, true);
- up_read(&rbd_dev->lock_rwsem);
+ if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
+ return 0;
+
+ rbd_assert(!rbd_is_lock_owner(rbd_dev));
+ queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
+ ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
+ ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
+ if (ret > 0)
+ ret = rbd_dev->acquire_err;
+ else if (!ret)
+ ret = -ETIMEDOUT;
+
if (ret) {
- rbd_warn(rbd_dev, "failed to acquire exclusive lock");
- return -EROFS;
+ rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
+ return ret;
}
+ /*
+ * The lock may have been released by now, unless automatic lock
+ * transitions are disabled.
+ */
+ rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
return 0;
}
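The new acquire path above waits on a completion with a killable timeout instead of the removed wait-queue loop. A self-contained sketch of that wait pattern; the field names mirror the rbd code, but the skeleton itself is illustrative:

#include <linux/completion.h>
#include <linux/errno.h>

struct dev_state {
	struct completion acquire_wait;
	int acquire_err;	/* filled in by the worker before complete_all() */
};

static int wait_for_acquire(struct dev_state *s, unsigned long timeout)
{
	long ret;

	/* >0: completed, 0: timed out, <0: fatal signal received. */
	ret = wait_for_completion_killable_timeout(&s->acquire_wait, timeout);
	if (ret > 0)
		return s->acquire_err;
	return ret ? ret : -ETIMEDOUT;
}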
@@ -5724,6 +6733,8 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
struct rbd_image_header *header;
rbd_dev_parent_put(rbd_dev);
+ rbd_object_map_free(rbd_dev);
+ rbd_dev_mapping_clear(rbd_dev);
/* Free dynamic fields from the header, then zero it out */
@@ -5824,7 +6835,6 @@ out_err:
static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
- rbd_dev_mapping_clear(rbd_dev);
rbd_free_disk(rbd_dev);
if (!single_major)
unregister_blkdev(rbd_dev->major, rbd_dev->name);
@@ -5858,23 +6868,17 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
if (ret)
goto err_out_blkdev;
- ret = rbd_dev_mapping_set(rbd_dev);
- if (ret)
- goto err_out_disk;
-
set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
if (ret)
- goto err_out_mapping;
+ goto err_out_disk;
set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
up_write(&rbd_dev->header_rwsem);
return 0;
-err_out_mapping:
- rbd_dev_mapping_clear(rbd_dev);
err_out_disk:
rbd_free_disk(rbd_dev);
err_out_blkdev:
@@ -5975,6 +6979,17 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
goto err_out_probe;
}
+ ret = rbd_dev_mapping_set(rbd_dev);
+ if (ret)
+ goto err_out_probe;
+
+ if (rbd_dev->spec->snap_id != CEPH_NOSNAP &&
+ (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
+ ret = rbd_object_map_load(rbd_dev);
+ if (ret)
+ goto err_out_probe;
+ }
+
if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
ret = rbd_dev_v2_parent_info(rbd_dev);
if (ret)
@@ -6071,11 +7086,9 @@ static ssize_t do_rbd_add(struct bus_type *bus,
if (rc)
goto err_out_image_probe;
- if (rbd_dev->opts->exclusive) {
- rc = rbd_add_acquire_lock(rbd_dev);
- if (rc)
- goto err_out_device_setup;
- }
+ rc = rbd_add_acquire_lock(rbd_dev);
+ if (rc)
+ goto err_out_image_lock;
/* Everything's ready. Announce the disk to the world. */
@@ -6101,7 +7114,6 @@ out:
err_out_image_lock:
rbd_dev_image_unlock(rbd_dev);
-err_out_device_setup:
rbd_dev_device_release(rbd_dev);
err_out_image_probe:
rbd_dev_image_release(rbd_dev);
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
index 62ff50d3e7a6..ac98ab6ccd3b 100644
--- a/drivers/block/rbd_types.h
+++ b/drivers/block/rbd_types.h
@@ -18,6 +18,7 @@
/* For format version 2, rbd image 'foo' consists of objects
* rbd_id.foo - id of image
* rbd_header.<id> - image metadata
+ * rbd_object_map.<id> - optional image object map
* rbd_data.<id>.0000000000000000
* rbd_data.<id>.0000000000000001
* ... - data
@@ -25,6 +26,7 @@
*/
#define RBD_HEADER_PREFIX "rbd_header."
+#define RBD_OBJECT_MAP_PREFIX "rbd_object_map."
#define RBD_ID_PREFIX "rbd_id."
#define RBD_V2_DATA_FORMAT "%s.%016llx"
@@ -39,6 +41,14 @@ enum rbd_notify_op {
RBD_NOTIFY_OP_HEADER_UPDATE = 3,
};
+#define OBJECT_NONEXISTENT 0
+#define OBJECT_EXISTS 1
+#define OBJECT_PENDING 2
+#define OBJECT_EXISTS_CLEAN 3
+
+#define RBD_FLAG_OBJECT_MAP_INVALID (1ULL << 0)
+#define RBD_FLAG_FAST_DIFF_INVALID (1ULL << 1)
+
/*
* For format version 1, rbd image 'foo' consists of objects
* foo.rbd - image metadata
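Each data object gets one of the four OBJECT_* states defined above. The object map packs states at two bits per object, so a lookup over a packed byte array could look like the following sketch (the byte/bit ordering here is assumed for illustration, not taken from the on-disk format):

#include <linux/types.h>

/* Two bits per object, four objects per byte (assumed packing). */
static u8 object_map_get(const u8 *map, u64 objno)
{
	return (map[objno >> 2] >> ((objno & 3) * 2)) & 3;
}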
diff --git a/drivers/cpufreq/bmips-cpufreq.c b/drivers/cpufreq/bmips-cpufreq.c
index 56a4ebbf00e0..f7c23fa468f0 100644
--- a/drivers/cpufreq/bmips-cpufreq.c
+++ b/drivers/cpufreq/bmips-cpufreq.c
@@ -131,23 +131,18 @@ static int bmips_cpufreq_exit(struct cpufreq_policy *policy)
static int bmips_cpufreq_init(struct cpufreq_policy *policy)
{
struct cpufreq_frequency_table *freq_table;
- int ret;
freq_table = bmips_cpufreq_get_freq_table(policy);
if (IS_ERR(freq_table)) {
- ret = PTR_ERR(freq_table);
- pr_err("%s: couldn't determine frequency table (%d).\n",
- BMIPS_CPUFREQ_NAME, ret);
- return ret;
+ pr_err("%s: couldn't determine frequency table (%ld).\n",
+ BMIPS_CPUFREQ_NAME, PTR_ERR(freq_table));
+ return PTR_ERR(freq_table);
}
- ret = cpufreq_generic_init(policy, freq_table, TRANSITION_LATENCY);
- if (ret)
- bmips_cpufreq_exit(policy);
- else
- pr_info("%s: registered\n", BMIPS_CPUFREQ_NAME);
+ cpufreq_generic_init(policy, freq_table, TRANSITION_LATENCY);
+ pr_info("%s: registered\n", BMIPS_CPUFREQ_NAME);
- return ret;
+ return 0;
}
static struct cpufreq_driver bmips_cpufreq_driver = {
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 0a9f675f2af4..8dda62367816 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -23,6 +23,7 @@
#include <linux/kernel_stat.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/pm_qos.h>
#include <linux/slab.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
@@ -158,7 +159,7 @@ EXPORT_SYMBOL_GPL(arch_set_freq_scale);
* - set policies transition latency
* - policy->cpus with all possible CPUs
*/
-int cpufreq_generic_init(struct cpufreq_policy *policy,
+void cpufreq_generic_init(struct cpufreq_policy *policy,
struct cpufreq_frequency_table *table,
unsigned int transition_latency)
{
@@ -170,8 +171,6 @@ int cpufreq_generic_init(struct cpufreq_policy *policy,
* share the clock and voltage.
*/
cpumask_setall(policy->cpus);
-
- return 0;
}
EXPORT_SYMBOL_GPL(cpufreq_generic_init);
@@ -714,23 +713,15 @@ static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf)
static ssize_t store_##file_name \
(struct cpufreq_policy *policy, const char *buf, size_t count) \
{ \
- int ret, temp; \
- struct cpufreq_policy new_policy; \
+ unsigned long val; \
+ int ret; \
\
- memcpy(&new_policy, policy, sizeof(*policy)); \
- new_policy.min = policy->user_policy.min; \
- new_policy.max = policy->user_policy.max; \
- \
- ret = sscanf(buf, "%u", &new_policy.object); \
+ ret = sscanf(buf, "%lu", &val); \
if (ret != 1) \
return -EINVAL; \
\
- temp = new_policy.object; \
- ret = cpufreq_set_policy(policy, &new_policy); \
- if (!ret) \
- policy->user_policy.object = temp; \
- \
- return ret ? ret : count; \
+ ret = dev_pm_qos_update_request(policy->object##_freq_req, val);\
+ return ret >= 0 ? count : ret; \
}
store_one(scaling_min_freq, min);
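For illustration, store_scaling_min_freq() now expands to roughly the following (hand-expanded from the macro above, not literal preprocessor output); the sysfs write is routed through a device PM QoS request rather than cpufreq_set_policy():

static ssize_t store_scaling_min_freq(struct cpufreq_policy *policy,
				      const char *buf, size_t count)
{
	unsigned long val;
	int ret;

	ret = sscanf(buf, "%lu", &val);
	if (ret != 1)
		return -EINVAL;

	/* Aggregation and policy re-evaluation happen in the QoS layer. */
	ret = dev_pm_qos_update_request(policy->min_freq_req, val);
	return ret >= 0 ? count : ret;
}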
@@ -996,7 +987,7 @@ static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
- if (!dev)
+ if (unlikely(!dev))
return;
if (cpumask_test_and_set_cpu(cpu, policy->real_cpus))
@@ -1112,17 +1103,18 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu)
return ret;
}
-static void refresh_frequency_limits(struct cpufreq_policy *policy)
+void refresh_frequency_limits(struct cpufreq_policy *policy)
{
- struct cpufreq_policy new_policy = *policy;
-
- pr_debug("updating policy for CPU %u\n", policy->cpu);
+ struct cpufreq_policy new_policy;
- new_policy.min = policy->user_policy.min;
- new_policy.max = policy->user_policy.max;
+ if (!policy_is_inactive(policy)) {
+ new_policy = *policy;
+ pr_debug("updating policy for CPU %u\n", policy->cpu);
- cpufreq_set_policy(policy, &new_policy);
+ cpufreq_set_policy(policy, &new_policy);
+ }
}
+EXPORT_SYMBOL(refresh_frequency_limits);
static void handle_update(struct work_struct *work)
{
@@ -1130,14 +1122,60 @@ static void handle_update(struct work_struct *work)
container_of(work, struct cpufreq_policy, update);
pr_debug("handle_update for cpu %u called\n", policy->cpu);
+ down_write(&policy->rwsem);
refresh_frequency_limits(policy);
+ up_write(&policy->rwsem);
+}
+
+static int cpufreq_notifier_min(struct notifier_block *nb, unsigned long freq,
+ void *data)
+{
+ struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_min);
+
+ schedule_work(&policy->update);
+ return 0;
+}
+
+static int cpufreq_notifier_max(struct notifier_block *nb, unsigned long freq,
+ void *data)
+{
+ struct cpufreq_policy *policy = container_of(nb, struct cpufreq_policy, nb_max);
+
+ schedule_work(&policy->update);
+ return 0;
+}
+
+static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
+{
+ struct kobject *kobj;
+ struct completion *cmp;
+
+ down_write(&policy->rwsem);
+ cpufreq_stats_free_table(policy);
+ kobj = &policy->kobj;
+ cmp = &policy->kobj_unregister;
+ up_write(&policy->rwsem);
+ kobject_put(kobj);
+
+ /*
+ * We need to make sure that the underlying kobj is
+ * actually not referenced anymore by anybody before we
+ * proceed with unloading.
+ */
+ pr_debug("waiting for dropping of refcount\n");
+ wait_for_completion(cmp);
+ pr_debug("wait complete\n");
}
static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
{
struct cpufreq_policy *policy;
+ struct device *dev = get_cpu_device(cpu);
int ret;
+ if (!dev)
+ return NULL;
+
policy = kzalloc(sizeof(*policy), GFP_KERNEL);
if (!policy)
return NULL;
@@ -1154,7 +1192,7 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
cpufreq_global_kobject, "policy%u", cpu);
if (ret) {
- pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret);
+ dev_err(dev, "%s: failed to init policy->kobj: %d\n", __func__, ret);
/*
* The entire policy object will be freed below, but the extra
* memory allocated for the kobject name needs to be freed by
@@ -1164,6 +1202,25 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
goto err_free_real_cpus;
}
+ policy->nb_min.notifier_call = cpufreq_notifier_min;
+ policy->nb_max.notifier_call = cpufreq_notifier_max;
+
+ ret = dev_pm_qos_add_notifier(dev, &policy->nb_min,
+ DEV_PM_QOS_MIN_FREQUENCY);
+ if (ret) {
+ dev_err(dev, "Failed to register MIN QoS notifier: %d (%*pbl)\n",
+ ret, cpumask_pr_args(policy->cpus));
+ goto err_kobj_remove;
+ }
+
+ ret = dev_pm_qos_add_notifier(dev, &policy->nb_max,
+ DEV_PM_QOS_MAX_FREQUENCY);
+ if (ret) {
+ dev_err(dev, "Failed to register MAX QoS notifier: %d (%*pbl)\n",
+ ret, cpumask_pr_args(policy->cpus));
+ goto err_min_qos_notifier;
+ }
+
INIT_LIST_HEAD(&policy->policy_list);
init_rwsem(&policy->rwsem);
spin_lock_init(&policy->transition_lock);
@@ -1174,6 +1231,11 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
policy->cpu = cpu;
return policy;
+err_min_qos_notifier:
+ dev_pm_qos_remove_notifier(dev, &policy->nb_min,
+ DEV_PM_QOS_MIN_FREQUENCY);
+err_kobj_remove:
+ cpufreq_policy_put_kobj(policy);
err_free_real_cpus:
free_cpumask_var(policy->real_cpus);
err_free_rcpumask:
@@ -1186,30 +1248,9 @@ err_free_policy:
return NULL;
}
-static void cpufreq_policy_put_kobj(struct cpufreq_policy *policy)
-{
- struct kobject *kobj;
- struct completion *cmp;
-
- down_write(&policy->rwsem);
- cpufreq_stats_free_table(policy);
- kobj = &policy->kobj;
- cmp = &policy->kobj_unregister;
- up_write(&policy->rwsem);
- kobject_put(kobj);
-
- /*
- * We need to make sure that the underlying kobj is
- * actually not referenced anymore by anybody before we
- * proceed with unloading.
- */
- pr_debug("waiting for dropping of refcount\n");
- wait_for_completion(cmp);
- pr_debug("wait complete\n");
-}
-
static void cpufreq_policy_free(struct cpufreq_policy *policy)
{
+ struct device *dev = get_cpu_device(policy->cpu);
unsigned long flags;
int cpu;
@@ -1221,6 +1262,14 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy)
per_cpu(cpufreq_cpu_data, cpu) = NULL;
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
+ dev_pm_qos_remove_notifier(dev, &policy->nb_max,
+ DEV_PM_QOS_MAX_FREQUENCY);
+ dev_pm_qos_remove_notifier(dev, &policy->nb_min,
+ DEV_PM_QOS_MIN_FREQUENCY);
+ dev_pm_qos_remove_request(policy->max_freq_req);
+ dev_pm_qos_remove_request(policy->min_freq_req);
+ kfree(policy->min_freq_req);
+
cpufreq_policy_put_kobj(policy);
free_cpumask_var(policy->real_cpus);
free_cpumask_var(policy->related_cpus);
@@ -1298,16 +1347,50 @@ static int cpufreq_online(unsigned int cpu)
cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);
if (new_policy) {
- policy->user_policy.min = policy->min;
- policy->user_policy.max = policy->max;
+ struct device *dev = get_cpu_device(cpu);
for_each_cpu(j, policy->related_cpus) {
per_cpu(cpufreq_cpu_data, j) = policy;
add_cpu_dev_symlink(policy, j);
}
- } else {
- policy->min = policy->user_policy.min;
- policy->max = policy->user_policy.max;
+
+ policy->min_freq_req = kzalloc(2 * sizeof(*policy->min_freq_req),
+ GFP_KERNEL);
+ if (!policy->min_freq_req)
+ goto out_destroy_policy;
+
+ ret = dev_pm_qos_add_request(dev, policy->min_freq_req,
+ DEV_PM_QOS_MIN_FREQUENCY,
+ policy->min);
+ if (ret < 0) {
+ /*
+ * So we don't call dev_pm_qos_remove_request() for an
+ * uninitialized request.
+ */
+ kfree(policy->min_freq_req);
+ policy->min_freq_req = NULL;
+
+ dev_err(dev, "Failed to add min-freq constraint (%d)\n",
+ ret);
+ goto out_destroy_policy;
+ }
+
+ /*
+ * This must be initialized right here to avoid calling
+ * dev_pm_qos_remove_request() on uninitialized request in case
+ * of errors.
+ */
+ policy->max_freq_req = policy->min_freq_req + 1;
+
+ ret = dev_pm_qos_add_request(dev, policy->max_freq_req,
+ DEV_PM_QOS_MAX_FREQUENCY,
+ policy->max);
+ if (ret < 0) {
+ policy->max_freq_req = NULL;
+ dev_err(dev, "Failed to add max-freq constraint (%d)\n",
+ ret);
+ goto out_destroy_policy;
+ }
}
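A single allocation backs both constraints: min_freq_req is element 0 and max_freq_req is element 1 of the same array, which is why cpufreq_policy_free() above frees only policy->min_freq_req. A condensed sketch of the pattern, with the cpufreq-specific unwinding elided:

#include <linux/pm_qos.h>
#include <linux/slab.h>

static int add_freq_constraints(struct device *dev,
				struct dev_pm_qos_request **min_req,
				struct dev_pm_qos_request **max_req,
				s32 min_khz, s32 max_khz)
{
	struct dev_pm_qos_request *reqs;
	int ret;

	reqs = kzalloc(2 * sizeof(*reqs), GFP_KERNEL);	/* [0]=min, [1]=max */
	if (!reqs)
		return -ENOMEM;

	ret = dev_pm_qos_add_request(dev, &reqs[0],
				     DEV_PM_QOS_MIN_FREQUENCY, min_khz);
	if (ret < 0)
		goto free;

	ret = dev_pm_qos_add_request(dev, &reqs[1],
				     DEV_PM_QOS_MAX_FREQUENCY, max_khz);
	if (ret < 0)
		goto remove_min;

	*min_req = &reqs[0];
	*max_req = &reqs[1];
	return 0;

remove_min:
	dev_pm_qos_remove_request(&reqs[0]);
free:
	kfree(reqs);
	return ret;
}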
if (cpufreq_driver->get && has_target()) {
@@ -2280,6 +2363,7 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
struct cpufreq_policy *new_policy)
{
struct cpufreq_governor *old_gov;
+ struct device *cpu_dev = get_cpu_device(policy->cpu);
int ret;
pr_debug("setting new policy for CPU %u: %u - %u kHz\n",
@@ -2288,17 +2372,21 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
memcpy(&new_policy->cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));
/*
- * This check works well when we store new min/max freq attributes,
- * because new_policy is a copy of policy with one field updated.
- */
- if (new_policy->min > new_policy->max)
- return -EINVAL;
+ * The PM QoS framework collects all the requests from users and provides
+ * us with the final aggregated value here.
+ */
+ new_policy->min = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MIN_FREQUENCY);
+ new_policy->max = dev_pm_qos_read_value(cpu_dev, DEV_PM_QOS_MAX_FREQUENCY);
/* verify the cpu speed can be set within this limit */
ret = cpufreq_driver->verify(new_policy);
if (ret)
return ret;
+ /*
+ * The notifier-chain shall be removed once all the users of
+ * CPUFREQ_ADJUST are moved to use the QoS framework.
+ */
/* adjust if necessary - all reasons */
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_ADJUST, new_policy);
@@ -2377,10 +2465,9 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
* @cpu: CPU to re-evaluate the policy for.
*
* Update the current frequency for the cpufreq policy of @cpu and use
- * cpufreq_set_policy() to re-apply the min and max limits saved in the
- * user_policy sub-structure of that policy, which triggers the evaluation
- * of policy notifiers and the cpufreq driver's ->verify() callback for the
- * policy in question, among other things.
+ * cpufreq_set_policy() to re-apply the min and max limits, which triggers the
+ * evaluation of policy notifiers and the cpufreq driver's ->verify() callback
+ * for the policy in question, among other things.
*/
void cpufreq_update_policy(unsigned int cpu)
{
@@ -2440,10 +2527,9 @@ static int cpufreq_boost_set_sw(int state)
break;
}
- down_write(&policy->rwsem);
- policy->user_policy.max = policy->max;
- cpufreq_governor_limits(policy);
- up_write(&policy->rwsem);
+ ret = dev_pm_qos_update_request(policy->max_freq_req, policy->max);
+ if (ret)
+ break;
}
return ret;
diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c
index 3de48ae60c29..297d23cad8b5 100644
--- a/drivers/cpufreq/davinci-cpufreq.c
+++ b/drivers/cpufreq/davinci-cpufreq.c
@@ -90,7 +90,8 @@ static int davinci_cpu_init(struct cpufreq_policy *policy)
* Setting the latency to 2000 us to accommodate addition of drivers
* to pre/post change notification list.
*/
- return cpufreq_generic_init(policy, freq_table, 2000 * 1000);
+ cpufreq_generic_init(policy, freq_table, 2000 * 1000);
+ return 0;
}
static struct cpufreq_driver davinci_driver = {
diff --git a/drivers/cpufreq/imx-cpufreq-dt.c b/drivers/cpufreq/imx-cpufreq-dt.c
index b54fd26ea7df..4f85f3112784 100644
--- a/drivers/cpufreq/imx-cpufreq-dt.c
+++ b/drivers/cpufreq/imx-cpufreq-dt.c
@@ -44,10 +44,11 @@ static int imx_cpufreq_dt_probe(struct platform_device *pdev)
* According to the datasheet, minimum speed grading is not supported for
* consumer parts, so clamp to 1 to avoid a warning for "no OPPs"
*
- * Applies to 8mq and 8mm.
+ * Applies to i.MX8M series SoCs.
*/
if (mkt_segment == 0 && speed_grade == 0 && (
of_machine_is_compatible("fsl,imx8mm") ||
+ of_machine_is_compatible("fsl,imx8mn") ||
of_machine_is_compatible("fsl,imx8mq")))
speed_grade = 1;
diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 47ccfa6b17b7..648a09a1778a 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -190,14 +190,12 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
static int imx6q_cpufreq_init(struct cpufreq_policy *policy)
{
- int ret;
-
policy->clk = clks[ARM].clk;
- ret = cpufreq_generic_init(policy, freq_table, transition_latency);
+ cpufreq_generic_init(policy, freq_table, transition_latency);
policy->suspend_freq = max_freq;
dev_pm_opp_of_register_em(policy->cpus);
- return ret;
+ return 0;
}
static struct cpufreq_driver imx6q_cpufreq_driver = {
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index f2ff5de988c1..cc27d4c59dca 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -898,7 +898,6 @@ static void intel_pstate_update_policies(void)
static void intel_pstate_update_max_freq(unsigned int cpu)
{
struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
- struct cpufreq_policy new_policy;
struct cpudata *cpudata;
if (!policy)
@@ -908,11 +907,7 @@ static void intel_pstate_update_max_freq(unsigned int cpu)
policy->cpuinfo.max_freq = global.turbo_disabled_mf ?
cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
- memcpy(&new_policy, policy, sizeof(*policy));
- new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq);
- new_policy.min = min(policy->user_policy.min, new_policy.max);
-
- cpufreq_set_policy(policy, &new_policy);
+ refresh_frequency_limits(policy);
cpufreq_cpu_release(policy);
}
diff --git a/drivers/cpufreq/kirkwood-cpufreq.c b/drivers/cpufreq/kirkwood-cpufreq.c
index 7ab564c1f7ae..cb74bdc5baaa 100644
--- a/drivers/cpufreq/kirkwood-cpufreq.c
+++ b/drivers/cpufreq/kirkwood-cpufreq.c
@@ -85,7 +85,8 @@ static int kirkwood_cpufreq_target(struct cpufreq_policy *policy,
/* Module init and exit code */
static int kirkwood_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
- return cpufreq_generic_init(policy, kirkwood_freq_table, 5000);
+ cpufreq_generic_init(policy, kirkwood_freq_table, 5000);
+ return 0;
}
static struct cpufreq_driver kirkwood_cpufreq_driver = {
diff --git a/drivers/cpufreq/loongson1-cpufreq.c b/drivers/cpufreq/loongson1-cpufreq.c
index 21c9ce8526c0..0ea88778882a 100644
--- a/drivers/cpufreq/loongson1-cpufreq.c
+++ b/drivers/cpufreq/loongson1-cpufreq.c
@@ -81,7 +81,7 @@ static int ls1x_cpufreq_init(struct cpufreq_policy *policy)
struct device *cpu_dev = get_cpu_device(policy->cpu);
struct cpufreq_frequency_table *freq_tbl;
unsigned int pll_freq, freq;
- int steps, i, ret;
+ int steps, i;
pll_freq = clk_get_rate(cpufreq->pll_clk) / 1000;
@@ -103,11 +103,9 @@ static int ls1x_cpufreq_init(struct cpufreq_policy *policy)
freq_tbl[i].frequency = CPUFREQ_TABLE_END;
policy->clk = cpufreq->clk;
- ret = cpufreq_generic_init(policy, freq_tbl, 0);
- if (ret)
- kfree(freq_tbl);
+ cpufreq_generic_init(policy, freq_tbl, 0);
- return ret;
+ return 0;
}
static int ls1x_cpufreq_exit(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/loongson2_cpufreq.c b/drivers/cpufreq/loongson2_cpufreq.c
index da344696beed..890813e0bb76 100644
--- a/drivers/cpufreq/loongson2_cpufreq.c
+++ b/drivers/cpufreq/loongson2_cpufreq.c
@@ -95,7 +95,8 @@ static int loongson2_cpufreq_cpu_init(struct cpufreq_policy *policy)
}
policy->clk = cpuclk;
- return cpufreq_generic_init(policy, &loongson2_clockmod_table[0], 0);
+ cpufreq_generic_init(policy, &loongson2_clockmod_table[0], 0);
+ return 0;
}
static int loongson2_cpufreq_exit(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/maple-cpufreq.c b/drivers/cpufreq/maple-cpufreq.c
index f5220b3d4ec5..28d346062166 100644
--- a/drivers/cpufreq/maple-cpufreq.c
+++ b/drivers/cpufreq/maple-cpufreq.c
@@ -140,7 +140,8 @@ static unsigned int maple_cpufreq_get_speed(unsigned int cpu)
static int maple_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
- return cpufreq_generic_init(policy, maple_cpu_freqs, 12000);
+ cpufreq_generic_init(policy, maple_cpu_freqs, 12000);
+ return 0;
}
static struct cpufreq_driver maple_cpufreq_driver = {
diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c
index 29643f06a3c3..8d14b42a8c6f 100644
--- a/drivers/cpufreq/omap-cpufreq.c
+++ b/drivers/cpufreq/omap-cpufreq.c
@@ -122,23 +122,18 @@ static int omap_cpu_init(struct cpufreq_policy *policy)
dev_err(mpu_dev,
"%s: cpu%d: failed creating freq table[%d]\n",
__func__, policy->cpu, result);
- goto fail;
+ clk_put(policy->clk);
+ return result;
}
}
atomic_inc_return(&freq_table_users);
/* FIXME: what's the actual transition time? */
- result = cpufreq_generic_init(policy, freq_table, 300 * 1000);
- if (!result) {
- dev_pm_opp_of_register_em(policy->cpus);
- return 0;
- }
+ cpufreq_generic_init(policy, freq_table, 300 * 1000);
+ dev_pm_opp_of_register_em(policy->cpus);
- freq_table_free();
-fail:
- clk_put(policy->clk);
- return result;
+ return 0;
}
static int omap_cpu_exit(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c
index 6b1e4abe3248..93f39a1d4c3d 100644
--- a/drivers/cpufreq/pasemi-cpufreq.c
+++ b/drivers/cpufreq/pasemi-cpufreq.c
@@ -196,7 +196,8 @@ static int pas_cpufreq_cpu_init(struct cpufreq_policy *policy)
policy->cur = pas_freqs[cur_astate].frequency;
ppc_proc_freq = policy->cur * 1000ul;
- return cpufreq_generic_init(policy, pas_freqs, get_gizmo_latency());
+ cpufreq_generic_init(policy, pas_freqs, get_gizmo_latency());
+ return 0;
out_unmap_sdcpwr:
iounmap(sdcpwr_mapbase);
diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c
index 650104d729f3..73621bc11976 100644
--- a/drivers/cpufreq/pmac32-cpufreq.c
+++ b/drivers/cpufreq/pmac32-cpufreq.c
@@ -372,7 +372,8 @@ static int pmac_cpufreq_target( struct cpufreq_policy *policy,
static int pmac_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
- return cpufreq_generic_init(policy, pmac_cpu_freqs, transition_latency);
+ cpufreq_generic_init(policy, pmac_cpu_freqs, transition_latency);
+ return 0;
}
static u32 read_gpio(struct device_node *np)
diff --git a/drivers/cpufreq/pmac64-cpufreq.c b/drivers/cpufreq/pmac64-cpufreq.c
index 1af3492a000d..d7542a106e6b 100644
--- a/drivers/cpufreq/pmac64-cpufreq.c
+++ b/drivers/cpufreq/pmac64-cpufreq.c
@@ -321,7 +321,8 @@ static unsigned int g5_cpufreq_get_speed(unsigned int cpu)
static int g5_cpufreq_cpu_init(struct cpufreq_policy *policy)
{
- return cpufreq_generic_init(policy, g5_cpu_freqs, transition_latency);
+ cpufreq_generic_init(policy, g5_cpu_freqs, transition_latency);
+ return 0;
}
static struct cpufreq_driver g5_cpufreq_driver = {
diff --git a/drivers/cpufreq/s3c2416-cpufreq.c b/drivers/cpufreq/s3c2416-cpufreq.c
index f7ff1ed7fef1..106910351c41 100644
--- a/drivers/cpufreq/s3c2416-cpufreq.c
+++ b/drivers/cpufreq/s3c2416-cpufreq.c
@@ -447,21 +447,16 @@ static int s3c2416_cpufreq_driver_init(struct cpufreq_policy *policy)
/* Datasheet says PLL stabilisation time must be at least 300us,
* so add some fudge. (reference in LOCKCON0 register description)
*/
- ret = cpufreq_generic_init(policy, s3c_freq->freq_table,
+ cpufreq_generic_init(policy, s3c_freq->freq_table,
(500 * 1000) + s3c_freq->regulator_latency);
- if (ret)
- goto err_freq_table;
-
register_reboot_notifier(&s3c2416_cpufreq_reboot_notifier);
return 0;
-err_freq_table:
#ifdef CONFIG_ARM_S3C2416_CPUFREQ_VCORESCALE
- regulator_put(s3c_freq->vddarm);
err_vddarm:
-#endif
clk_put(s3c_freq->armclk);
+#endif
err_armclk:
clk_put(s3c_freq->hclk);
err_hclk:
diff --git a/drivers/cpufreq/s3c64xx-cpufreq.c b/drivers/cpufreq/s3c64xx-cpufreq.c
index 37df2d892eb0..af0c00dabb22 100644
--- a/drivers/cpufreq/s3c64xx-cpufreq.c
+++ b/drivers/cpufreq/s3c64xx-cpufreq.c
@@ -144,7 +144,6 @@ out:
static int s3c64xx_cpufreq_driver_init(struct cpufreq_policy *policy)
{
- int ret;
struct cpufreq_frequency_table *freq;
if (policy->cpu != 0)
@@ -165,8 +164,7 @@ static int s3c64xx_cpufreq_driver_init(struct cpufreq_policy *policy)
#ifdef CONFIG_REGULATOR
vddarm = regulator_get(NULL, "vddarm");
if (IS_ERR(vddarm)) {
- ret = PTR_ERR(vddarm);
- pr_err("Failed to obtain VDDARM: %d\n", ret);
+ pr_err("Failed to obtain VDDARM: %ld\n", PTR_ERR(vddarm));
pr_err("Only frequency scaling available\n");
vddarm = NULL;
} else {
@@ -196,16 +194,9 @@ static int s3c64xx_cpufreq_driver_init(struct cpufreq_policy *policy)
* the PLLs, which we don't currently) is ~300us worst case,
* but add some fudge.
*/
- ret = cpufreq_generic_init(policy, s3c64xx_freq_table,
+ cpufreq_generic_init(policy, s3c64xx_freq_table,
(500 * 1000) + regulator_latency);
- if (ret != 0) {
- pr_err("Failed to configure frequency table: %d\n",
- ret);
- regulator_put(vddarm);
- clk_put(policy->clk);
- }
-
- return ret;
+ return 0;
}
static struct cpufreq_driver s3c64xx_cpufreq_driver = {
diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c
index e5cb17d4be7b..5d10030f2560 100644
--- a/drivers/cpufreq/s5pv210-cpufreq.c
+++ b/drivers/cpufreq/s5pv210-cpufreq.c
@@ -541,7 +541,8 @@ static int s5pv210_cpu_init(struct cpufreq_policy *policy)
s5pv210_dram_conf[1].freq = clk_get_rate(dmc1_clk);
policy->suspend_freq = SLEEP_FREQ;
- return cpufreq_generic_init(policy, s5pv210_freq_table, 40000);
+ cpufreq_generic_init(policy, s5pv210_freq_table, 40000);
+ return 0;
out_dmc1:
clk_put(dmc0_clk);
diff --git a/drivers/cpufreq/sa1100-cpufreq.c b/drivers/cpufreq/sa1100-cpufreq.c
index ab5cab93e638..5c075ef6adc0 100644
--- a/drivers/cpufreq/sa1100-cpufreq.c
+++ b/drivers/cpufreq/sa1100-cpufreq.c
@@ -181,7 +181,8 @@ static int sa1100_target(struct cpufreq_policy *policy, unsigned int ppcr)
static int __init sa1100_cpu_init(struct cpufreq_policy *policy)
{
- return cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+ cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+ return 0;
}
static struct cpufreq_driver sa1100_driver __refdata = {
diff --git a/drivers/cpufreq/sa1110-cpufreq.c b/drivers/cpufreq/sa1110-cpufreq.c
index dab54e051c0e..d9d04d935b3a 100644
--- a/drivers/cpufreq/sa1110-cpufreq.c
+++ b/drivers/cpufreq/sa1110-cpufreq.c
@@ -303,7 +303,8 @@ static int sa1110_target(struct cpufreq_policy *policy, unsigned int ppcr)
static int __init sa1110_cpu_init(struct cpufreq_policy *policy)
{
- return cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+ cpufreq_generic_init(policy, sa11x0_freq_table, 0);
+ return 0;
}
/* sa1110_driver needs __refdata because it must remain after init registers
diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c
index 4074e2615522..73bd8dc47074 100644
--- a/drivers/cpufreq/spear-cpufreq.c
+++ b/drivers/cpufreq/spear-cpufreq.c
@@ -153,8 +153,9 @@ static int spear_cpufreq_target(struct cpufreq_policy *policy,
static int spear_cpufreq_init(struct cpufreq_policy *policy)
{
policy->clk = spear_cpufreq.clk;
- return cpufreq_generic_init(policy, spear_cpufreq.freq_tbl,
+ cpufreq_generic_init(policy, spear_cpufreq.freq_tbl,
spear_cpufreq.transition_latency);
+ return 0;
}
static struct cpufreq_driver spear_cpufreq_driver = {
diff --git a/drivers/cpufreq/tegra20-cpufreq.c b/drivers/cpufreq/tegra20-cpufreq.c
index 3c32cc7b0671..f84ecd22f488 100644
--- a/drivers/cpufreq/tegra20-cpufreq.c
+++ b/drivers/cpufreq/tegra20-cpufreq.c
@@ -118,17 +118,11 @@ static int tegra_target(struct cpufreq_policy *policy, unsigned int index)
static int tegra_cpu_init(struct cpufreq_policy *policy)
{
struct tegra20_cpufreq *cpufreq = cpufreq_get_driver_data();
- int ret;
clk_prepare_enable(cpufreq->cpu_clk);
/* FIXME: what's the actual transition time? */
- ret = cpufreq_generic_init(policy, freq_table, 300 * 1000);
- if (ret) {
- clk_disable_unprepare(cpufreq->cpu_clk);
- return ret;
- }
-
+ cpufreq_generic_init(policy, freq_table, 300 * 1000);
policy->clk = cpufreq->cpu_clk;
policy->suspend_freq = freq_table[0].frequency;
return 0;
diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index 9fddf828a76f..2e3e14192bee 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -110,7 +110,7 @@ int cpuidle_governor_latency_req(unsigned int cpu)
{
int global_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
struct device *device = get_cpu_device(cpu);
- int device_req = dev_pm_qos_raw_read_value(device);
+ int device_req = dev_pm_qos_raw_resume_latency(device);
return device_req < global_req ? device_req : global_req;
}
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 2109cfe80219..8fafbeab510a 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -295,6 +295,22 @@ static ssize_t target_node_show(struct device *dev,
}
static DEVICE_ATTR_RO(target_node);
+static unsigned long long dev_dax_resource(struct dev_dax *dev_dax)
+{
+ struct dax_region *dax_region = dev_dax->region;
+
+ return dax_region->res.start;
+}
+
+static ssize_t resource_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dev_dax *dev_dax = to_dev_dax(dev);
+
+ return sprintf(buf, "%#llx\n", dev_dax_resource(dev_dax));
+}
+static DEVICE_ATTR_RO(resource);
+
static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -313,6 +329,8 @@ static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n)
if (a == &dev_attr_target_node.attr && dev_dax_target_node(dev_dax) < 0)
return 0;
+ if (a == &dev_attr_resource.attr)
+ return 0400;
return a->mode;
}
@@ -320,6 +338,7 @@ static struct attribute *dev_dax_attributes[] = {
&dev_attr_modalias.attr,
&dev_attr_size.attr,
&dev_attr_target_node.attr,
+ &dev_attr_resource.attr,
NULL,
};
@@ -388,7 +407,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
* No 'host' or dax_operations since there is no access to this
* device outside of mmap of the resulting character device.
*/
- dax_dev = alloc_dax(dev_dax, NULL, NULL);
+ dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
if (!dax_dev)
goto err;
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 4e5ae7e8b557..8ab12068eea3 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -195,6 +195,8 @@ enum dax_device_flags {
DAXDEV_ALIVE,
/* gate whether dax_flush() calls the low level flush routine */
DAXDEV_WRITE_CACHE,
+ /* flag to check if device supports synchronous flush */
+ DAXDEV_SYNC,
};
/**
@@ -372,6 +374,18 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev)
}
EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
+bool __dax_synchronous(struct dax_device *dax_dev)
+{
+ return test_bit(DAXDEV_SYNC, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(__dax_synchronous);
+
+void __set_dax_synchronous(struct dax_device *dax_dev)
+{
+ set_bit(DAXDEV_SYNC, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(__set_dax_synchronous);
+
bool dax_alive(struct dax_device *dax_dev)
{
lockdep_assert_held(&dax_srcu);
@@ -526,7 +540,7 @@ static void dax_add_host(struct dax_device *dax_dev, const char *host)
}
struct dax_device *alloc_dax(void *private, const char *__host,
- const struct dax_operations *ops)
+ const struct dax_operations *ops, unsigned long flags)
{
struct dax_device *dax_dev;
const char *host;
@@ -549,6 +563,9 @@ struct dax_device *alloc_dax(void *private, const char *__host,
dax_add_host(dax_dev, host);
dax_dev->ops = ops;
dax_dev->private = private;
+ if (flags & DAXDEV_F_SYNC)
+ set_dax_synchronous(dax_dev);
+
return dax_dev;
err_dev:
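With the added flags argument, a caller that knows its backing store is synchronous passes DAXDEV_F_SYNC at allocation time instead of setting the bit later. A hedged usage sketch built only on the signature shown above; attach_dax() is a hypothetical helper:

#include <linux/dax.h>

static struct dax_device *attach_dax(void *drvdata,
				     const struct dax_operations *ops,
				     bool synchronous)
{
	/* alloc_dax() returns NULL on failure, per the call sites above. */
	return alloc_dax(drvdata, NULL, ops,
			 synchronous ? DAXDEV_F_SYNC : 0);
}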
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 671c24332802..df2011de7be2 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -28,10 +28,27 @@
#include "dm-core.h"
-#define SUB_JOB_SIZE 128
#define SPLIT_COUNT 8
#define MIN_JOBS 8
-#define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE))
+
+#define DEFAULT_SUB_JOB_SIZE_KB 512
+#define MAX_SUB_JOB_SIZE_KB 1024
+
+static unsigned kcopyd_subjob_size_kb = DEFAULT_SUB_JOB_SIZE_KB;
+
+module_param(kcopyd_subjob_size_kb, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(kcopyd_subjob_size_kb, "Sub-job size for dm-kcopyd clients");
+
+static unsigned dm_get_kcopyd_subjob_size(void)
+{
+ unsigned sub_job_size_kb;
+
+ sub_job_size_kb = __dm_get_module_param(&kcopyd_subjob_size_kb,
+ DEFAULT_SUB_JOB_SIZE_KB,
+ MAX_SUB_JOB_SIZE_KB);
+
+ return sub_job_size_kb << 1;
+}
/*-----------------------------------------------------------------
* Each kcopyd client has its own little pool of preallocated
@@ -41,6 +58,7 @@ struct dm_kcopyd_client {
struct page_list *pages;
unsigned nr_reserved_pages;
unsigned nr_free_pages;
+ unsigned sub_job_size;
struct dm_io_client *io_client;
@@ -693,8 +711,8 @@ static void segment_complete(int read_err, unsigned long write_err,
progress = job->progress;
count = job->source.count - progress;
if (count) {
- if (count > SUB_JOB_SIZE)
- count = SUB_JOB_SIZE;
+ if (count > kc->sub_job_size)
+ count = kc->sub_job_size;
job->progress += count;
}
@@ -821,7 +839,7 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->master_job = job;
job->write_offset = 0;
- if (job->source.count <= SUB_JOB_SIZE)
+ if (job->source.count <= kc->sub_job_size)
dispatch_job(job);
else {
job->progress = 0;
@@ -888,6 +906,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle)
{
int r;
+ unsigned reserve_pages;
struct dm_kcopyd_client *kc;
kc = kzalloc(sizeof(*kc), GFP_KERNEL);
@@ -912,9 +931,12 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro
goto bad_workqueue;
}
+ kc->sub_job_size = dm_get_kcopyd_subjob_size();
+ reserve_pages = DIV_ROUND_UP(kc->sub_job_size << SECTOR_SHIFT, PAGE_SIZE);
+
kc->pages = NULL;
kc->nr_reserved_pages = kc->nr_free_pages = 0;
- r = client_reserve_pages(kc, RESERVE_PAGES);
+ r = client_reserve_pages(kc, reserve_pages);
if (r)
goto bad_client_pages;
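The << 1 in dm_get_kcopyd_subjob_size() converts kilobytes to 512-byte sectors (1 KB = 2 sectors), so the 512 KB default yields 1024 sectors per sub-job, and the reservation above becomes DIV_ROUND_UP(1024 << 9, PAGE_SIZE) = 128 pages with 4 KB pages. A worked check of that arithmetic as a sketch:

#include <linux/kernel.h>	/* DIV_ROUND_UP */

static unsigned kb_to_sectors(unsigned kb)
{
	return kb << 1;			/* 1 KB == 2 * 512-byte sectors */
}

/* 512 KB default: 1024 sectors -> 524288 bytes -> 128 pages @ 4 KB. */
static unsigned pages_per_sub_job(unsigned kb, unsigned long page_size)
{
	return DIV_ROUND_UP((unsigned long)kb_to_sectors(kb) << 9 /* SECTOR_SHIFT */,
			    page_size);
}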
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 63916e1dc569..f150f5c5492b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2072,6 +2072,12 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+ /* Once merging, discards no longer effect change */
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
down_write(&s->lock);
@@ -2331,6 +2337,8 @@ static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
if (snap->discard_zeroes_cow) {
struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+ down_read(&_origins_lock);
+
(void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest)
snap = snap_src;
@@ -2338,6 +2346,8 @@ static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
/* All discards are split on chunk_size boundary */
limits->discard_granularity = snap->store->chunk_size;
limits->max_discard_sectors = snap->store->chunk_size;
+
+ up_read(&_origins_lock);
}
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ec8b27e20de3..caaee8032afe 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -881,7 +881,7 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
EXPORT_SYMBOL_GPL(dm_table_set_type);
/* validate the dax capability of the target device span */
-static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
int blocksize = *(int *) data;
@@ -890,7 +890,15 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
start, len);
}
-bool dm_table_supports_dax(struct dm_table *t, int blocksize)
+/* Check devices support synchronous DAX */
+static int device_synchronous(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ return dax_synchronous(dev->dax_dev);
+}
+
+bool dm_table_supports_dax(struct dm_table *t,
+ iterate_devices_callout_fn iterate_fn, int *blocksize)
{
struct dm_target *ti;
unsigned i;
@@ -903,8 +911,7 @@ bool dm_table_supports_dax(struct dm_table *t, int blocksize)
return false;
if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, device_supports_dax,
- &blocksize))
+ !ti->type->iterate_devices(ti, iterate_fn, blocksize))
return false;
}
@@ -940,6 +947,7 @@ static int dm_table_determine_type(struct dm_table *t)
struct dm_target *tgt;
struct list_head *devices = dm_table_get_devices(t);
enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
+ int page_size = PAGE_SIZE;
if (t->type != DM_TYPE_NONE) {
/* target already set the table's type */
@@ -984,7 +992,7 @@ static int dm_table_determine_type(struct dm_table *t)
verify_bio_based:
/* We must use this table as bio-based */
t->type = DM_TYPE_BIO_BASED;
- if (dm_table_supports_dax(t, PAGE_SIZE) ||
+ if (dm_table_supports_dax(t, device_supports_dax, &page_size) ||
(list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
t->type = DM_TYPE_DAX_BIO_BASED;
} else {
@@ -1883,6 +1891,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
bool wc = false, fua = false;
+ int page_size = PAGE_SIZE;
/*
* Copy table's limits to the DM device's request_queue
@@ -1910,8 +1919,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
blk_queue_write_cache(q, wc, fua);
- if (dm_table_supports_dax(t, PAGE_SIZE))
+ if (dm_table_supports_dax(t, device_supports_dax, &page_size)) {
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+ if (dm_table_supports_dax(t, device_synchronous, NULL))
+ set_dax_synchronous(t->md->dax_dev);
+ }
else
blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 9faf3e49c7af..8545dcee9fd0 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1602,30 +1602,6 @@ struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
}
/*
- * Activate a zone (increment its reference count).
- */
-void dmz_activate_zone(struct dm_zone *zone)
-{
- set_bit(DMZ_ACTIVE, &zone->flags);
- atomic_inc(&zone->refcount);
-}
-
-/*
- * Deactivate a zone. This decrement the zone reference counter
- * and clears the active state of the zone once the count reaches 0,
- * indicating that all BIOs to the zone have completed. Returns
- * true if the zone was deactivated.
- */
-void dmz_deactivate_zone(struct dm_zone *zone)
-{
- if (atomic_dec_and_test(&zone->refcount)) {
- WARN_ON(!test_bit(DMZ_ACTIVE, &zone->flags));
- clear_bit_unlock(DMZ_ACTIVE, &zone->flags);
- smp_mb__after_atomic();
- }
-}
-
-/*
* Get the zone mapping a chunk, if the chunk is mapped already.
* If no mapping exist and the operation is WRITE, a zone is
* allocated and used to map the chunk.
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index 12419f0bfe78..ed8de49c9a08 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -115,7 +115,6 @@ enum {
DMZ_BUF,
/* Zone internal state */
- DMZ_ACTIVE,
DMZ_RECLAIM,
DMZ_SEQ_WRITE_ERR,
};
@@ -128,7 +127,6 @@ enum {
#define dmz_is_empty(z) ((z)->wp_block == 0)
#define dmz_is_offline(z) test_bit(DMZ_OFFLINE, &(z)->flags)
#define dmz_is_readonly(z) test_bit(DMZ_READ_ONLY, &(z)->flags)
-#define dmz_is_active(z) test_bit(DMZ_ACTIVE, &(z)->flags)
#define dmz_in_reclaim(z) test_bit(DMZ_RECLAIM, &(z)->flags)
#define dmz_seq_write_err(z) test_bit(DMZ_SEQ_WRITE_ERR, &(z)->flags)
@@ -188,8 +186,30 @@ void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone);
unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd);
unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd);
-void dmz_activate_zone(struct dm_zone *zone);
-void dmz_deactivate_zone(struct dm_zone *zone);
+/*
+ * Activate a zone (increment its reference count).
+ */
+static inline void dmz_activate_zone(struct dm_zone *zone)
+{
+ atomic_inc(&zone->refcount);
+}
+
+/*
+ * Deactivate a zone. This decrements the zone reference counter;
+ * a count of 0 indicates that all BIOs to the zone have completed.
+ */
+static inline void dmz_deactivate_zone(struct dm_zone *zone)
+{
+ atomic_dec(&zone->refcount);
+}
+
+/*
+ * Test if a zone is active, that is, has a refcount > 0.
+ */
+static inline bool dmz_is_active(struct dm_zone *zone)
+{
+ return atomic_read(&zone->refcount);
+}
int dmz_lock_zone_reclaim(struct dm_zone *zone);
void dmz_unlock_zone_reclaim(struct dm_zone *zone);
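With the DMZ_ACTIVE bit gone, a zone is active exactly when its refcount is non-zero, and the three inline helpers above are the whole API. An illustrative (hypothetical) usage pattern around BIO submission:

#include <linux/bio.h>

/* Pin the zone across an I/O; the completion path drops the reference. */
static void submit_zone_io(struct dm_zone *zone, struct bio *bio)
{
	dmz_activate_zone(zone);	/* refcount++ */
	submit_bio(bio);
}

static void zone_io_done(struct dm_zone *zone)
{
	dmz_deactivate_zone(zone);	/* refcount--; 0 => no BIOs in flight */
}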
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 61f1152b74e9..d0beef033e2f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1117,7 +1117,7 @@ static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bd
if (!map)
return false;
- ret = dm_table_supports_dax(map, blocksize);
+ ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
dm_put_live_table(md, srcu_idx);
@@ -1989,7 +1989,8 @@ static struct mapped_device *alloc_dev(int minor)
sprintf(md->disk->disk_name, "dm-%d", minor);
if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
- md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+ md->dax_dev = alloc_dax(md, md->disk->disk_name,
+ &dm_dax_ops, 0);
if (!md->dax_dev)
goto bad;
}
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 17e3db54404c..0475673337f3 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -72,7 +72,10 @@ bool dm_table_bio_based(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
-bool dm_table_supports_dax(struct dm_table *t, int blocksize);
+bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn,
+ int *blocksize);
+int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data);
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 6f2a088afad6..cefe233e0b52 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o
obj-$(CONFIG_ND_BLK) += nd_blk.o
obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
obj-$(CONFIG_OF_PMEM) += of_pmem.o
+obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o
nd_pmem-y := pmem.o
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 26c1c7618891..2985ca949912 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -255,7 +255,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
sector_t sector = offset >> 9;
- int rc = 0;
+ int rc = 0, ret = 0;
if (unlikely(!size))
return 0;
@@ -293,7 +293,9 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
}
memcpy_flushcache(nsio->addr + offset, buf, size);
- nvdimm_flush(to_nd_region(ndns->dev.parent));
+ ret = nvdimm_flush(to_nd_region(ndns->dev.parent), NULL);
+ if (ret)
+ rc = ret;
return rc;
}
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index a434a5964cb9..2d8d7e554877 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1822,8 +1822,8 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid,
&& !guid_equal(&nd_set->type_guid,
&nd_label->type_guid)) {
dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n",
- nd_set->type_guid.b,
- nd_label->type_guid.b);
+ &nd_set->type_guid,
+ &nd_label->type_guid);
continue;
}
@@ -2227,8 +2227,8 @@ static struct device *create_namespace_blk(struct nd_region *nd_region,
if (namespace_label_has(ndd, type_guid)) {
if (!guid_equal(&nd_set->type_guid, &nd_label->type_guid)) {
dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n",
- nd_set->type_guid.b,
- nd_label->type_guid.b);
+ &nd_set->type_guid,
+ &nd_label->type_guid);
return ERR_PTR(-EAGAIN);
}
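Both namespace_devs.c hunks correct the dev_dbg() arguments for %pUb, which formats a UUID/GUID from a pointer to its 16 raw bytes; passing the address of the guid_t itself is the conventional, type-safe form. A short kernel-style sketch of the specifier (illustrative only, not compiled standalone):

    #include <linux/uuid.h>
    #include <linux/printk.h>

    static void show_guid_example(void)
    {
            guid_t g;

            guid_gen(&g);                            /* random GUID */
            pr_debug("generated guid: %pUb\n", &g);  /* %pUb takes a pointer */
    }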
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index d24304c0e6d7..1b9955651379 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -155,6 +155,7 @@ struct nd_region {
struct badblocks bb;
struct nd_interleave_set *nd_set;
struct nd_percpu_lane __percpu *lane;
+ int (*flush)(struct nd_region *nd_region, struct bio *bio);
struct nd_mapping mapping[0];
};
diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
new file mode 100644
index 000000000000..10351d5b49fa
--- /dev/null
+++ b/drivers/nvdimm/nd_virtio.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * nd_virtio.c: Virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and provides a virtio based flushing
+ * interface.
+ */
+#include "virtio_pmem.h"
+#include "nd.h"
+
+ /* The interrupt handler */
+void virtio_pmem_host_ack(struct virtqueue *vq)
+{
+ struct virtio_pmem *vpmem = vq->vdev->priv;
+ struct virtio_pmem_request *req_data, *req_buf;
+ unsigned long flags;
+ unsigned int len;
+
+ spin_lock_irqsave(&vpmem->pmem_lock, flags);
+ while ((req_data = virtqueue_get_buf(vq, &len)) != NULL) {
+ req_data->done = true;
+ wake_up(&req_data->host_acked);
+
+ if (!list_empty(&vpmem->req_list)) {
+ req_buf = list_first_entry(&vpmem->req_list,
+ struct virtio_pmem_request, list);
+ req_buf->wq_buf_avail = true;
+ wake_up(&req_buf->wq_buf);
+ list_del(&req_buf->list);
+ }
+ }
+ spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+}
+EXPORT_SYMBOL_GPL(virtio_pmem_host_ack);
+
+ /* The request submission function */
+static int virtio_pmem_flush(struct nd_region *nd_region)
+{
+ struct virtio_device *vdev = nd_region->provider_data;
+ struct virtio_pmem *vpmem = vdev->priv;
+ struct virtio_pmem_request *req_data;
+ struct scatterlist *sgs[2], sg, ret;
+ unsigned long flags;
+ int err, err1;
+
+ might_sleep();
+ req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+ if (!req_data)
+ return -ENOMEM;
+
+ req_data->done = false;
+ init_waitqueue_head(&req_data->host_acked);
+ init_waitqueue_head(&req_data->wq_buf);
+ INIT_LIST_HEAD(&req_data->list);
+ req_data->req.type = cpu_to_le32(VIRTIO_PMEM_REQ_TYPE_FLUSH);
+ sg_init_one(&sg, &req_data->req, sizeof(req_data->req));
+ sgs[0] = &sg;
+ sg_init_one(&ret, &req_data->resp.ret, sizeof(req_data->resp));
+ sgs[1] = &ret;
+
+ spin_lock_irqsave(&vpmem->pmem_lock, flags);
+ /*
+ * If virtqueue_add_sgs() returns -ENOSPC, the req_vq virtqueue
+ * has no free descriptors. We add the request to req_list and
+ * wait for host_ack to wake us up when free slots become
+ * available.
+ */
+ while ((err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req_data,
+ GFP_ATOMIC)) == -ENOSPC) {
+
+ dev_info(&vdev->dev, "failed to send command to virtio pmem device, no free slots in the virtqueue\n");
+ req_data->wq_buf_avail = false;
+ list_add_tail(&req_data->list, &vpmem->req_list);
+ spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+
+ /* A host response results in "host_ack" getting called */
+ wait_event(req_data->wq_buf, req_data->wq_buf_avail);
+ spin_lock_irqsave(&vpmem->pmem_lock, flags);
+ }
+ err1 = virtqueue_kick(vpmem->req_vq);
+ spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+ /*
+ * virtqueue_add_sgs() failed with an error other than -ENOSPC;
+ * there is nothing more we can do about it.
+ */
+ if (err || !err1) {
+ dev_info(&vdev->dev, "failed to send command to virtio pmem device\n");
+ err = -EIO;
+ } else {
+ /* A host response results in "host_ack" getting called */
+ wait_event(req_data->host_acked, req_data->done);
+ err = le32_to_cpu(req_data->resp.ret);
+ }
+
+ kfree(req_data);
+ return err;
+};
+
+/* The asynchronous flush callback function */
+int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
+{
+ /*
+ * Create a child bio for the asynchronous flush and chain it to the
+ * parent bio. Otherwise, call the nd_region flush directly.
+ */
+ if (bio && bio->bi_iter.bi_sector != -1) {
+ struct bio *child = bio_alloc(GFP_ATOMIC, 0);
+
+ if (!child)
+ return -ENOMEM;
+ bio_copy_dev(child, bio);
+ child->bi_opf = REQ_PREFLUSH;
+ child->bi_iter.bi_sector = -1;
+ bio_chain(child, bio);
+ submit_bio(child);
+ return 0;
+ }
+ if (virtio_pmem_flush(nd_region))
+ return -EIO;
+
+ return 0;
+};
+EXPORT_SYMBOL_GPL(async_pmem_flush);
+MODULE_LICENSE("GPL");
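The submission path above parks a request on req_list when the virtqueue is full (-ENOSPC) and relies on virtio_pmem_host_ack() to wake exactly one parked submitter once a descriptor is returned. A minimal userspace analogue of that park-and-wake discipline, using a mutex and condition variable in place of pmem_lock and the wait queues (all names illustrative):

    #include <pthread.h>

    #define QUEUE_DEPTH 4

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  slot_avail = PTHREAD_COND_INITIALIZER;
    static int in_flight;

    /* Submitter: claim a descriptor, parking while the queue is full. */
    static void submit_flush(void)
    {
            pthread_mutex_lock(&lock);
            while (in_flight == QUEUE_DEPTH)        /* like -ENOSPC above */
                    pthread_cond_wait(&slot_avail, &lock);
            in_flight++;                            /* request is queued */
            pthread_mutex_unlock(&lock);
    }

    /* Completion side (the host ack): free a descriptor, wake one waiter. */
    static void complete_flush(void)
    {
            pthread_mutex_lock(&lock);
            in_flight--;
            pthread_cond_signal(&slot_avail);
            pthread_mutex_unlock(&lock);
    }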
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index e7d8cc9f41e8..2bf3acd69613 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -184,6 +184,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
+ int ret = 0;
blk_status_t rc = 0;
bool do_acct;
unsigned long start;
@@ -193,7 +194,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
struct nd_region *nd_region = to_region(pmem);
if (bio->bi_opf & REQ_PREFLUSH)
- nvdimm_flush(nd_region);
+ ret = nvdimm_flush(nd_region, bio);
do_acct = nd_iostat_start(bio, &start);
bio_for_each_segment(bvec, bio, iter) {
@@ -208,7 +209,10 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
nd_iostat_end(bio, start);
if (bio->bi_opf & REQ_FUA)
- nvdimm_flush(nd_region);
+ ret = nvdimm_flush(nd_region, bio);
+
+ if (ret)
+ bio->bi_status = errno_to_blk_status(ret);
bio_endio(bio);
return BLK_QC_T_NONE;
@@ -362,6 +366,7 @@ static int pmem_attach_disk(struct device *dev,
struct gendisk *disk;
void *addr;
int rc;
+ unsigned long flags = 0UL;
pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
if (!pmem)
@@ -457,14 +462,15 @@ static int pmem_attach_disk(struct device *dev,
nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
disk->bb = &pmem->bb;
- dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
+ if (is_nvdimm_sync(nd_region))
+ flags = DAXDEV_F_SYNC;
+ dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
if (!dax_dev) {
put_disk(disk);
return -ENOMEM;
}
dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
pmem->dax_dev = dax_dev;
-
gendev = disk_to_dev(disk);
gendev->groups = pmem_attribute_groups;
@@ -522,14 +528,14 @@ static int nd_pmem_remove(struct device *dev)
sysfs_put(pmem->bb_state);
pmem->bb_state = NULL;
}
- nvdimm_flush(to_nd_region(dev->parent));
+ nvdimm_flush(to_nd_region(dev->parent), NULL);
return 0;
}
static void nd_pmem_shutdown(struct device *dev)
{
- nvdimm_flush(to_nd_region(dev->parent));
+ nvdimm_flush(to_nd_region(dev->parent), NULL);
}
static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
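The pmem.c hunks give the flush calls a return value and fold any failure into the bio status: REQ_PREFLUSH flushes before the data is transferred, REQ_FUA flushes after, and errno_to_blk_status() translates the last error (e.g. -EIO) for bio_endio(). Condensed, the flow in pmem_make_request() has this shape (a sketch of the hunks above, not new behavior):

    int ret = 0;

    if (bio->bi_opf & REQ_PREFLUSH)
            ret = nvdimm_flush(nd_region, bio);   /* flush before the data */
    /* ... per-segment data transfer ... */
    if (bio->bi_opf & REQ_FUA)
            ret = nvdimm_flush(nd_region, bio);   /* flush after the data */
    if (ret)
            bio->bi_status = errno_to_blk_status(ret);
    bio_endio(bio);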
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 4fed9ce9c2fe..56f2227f192a 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -287,7 +287,9 @@ static ssize_t deep_flush_store(struct device *dev, struct device_attribute *att
return rc;
if (!flush)
return -EINVAL;
- nvdimm_flush(nd_region);
+ rc = nvdimm_flush(nd_region, NULL);
+ if (rc)
+ return rc;
return len;
}
@@ -1077,6 +1079,11 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
dev->of_node = ndr_desc->of_node;
nd_region->ndr_size = resource_size(ndr_desc->res);
nd_region->ndr_start = ndr_desc->res->start;
+ if (ndr_desc->flush)
+ nd_region->flush = ndr_desc->flush;
+ else
+ nd_region->flush = NULL;
+
nd_device_register(dev);
return nd_region;
@@ -1117,11 +1124,24 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
}
EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
+int nvdimm_flush(struct nd_region *nd_region, struct bio *bio)
+{
+ int rc = 0;
+
+ if (!nd_region->flush)
+ rc = generic_nvdimm_flush(nd_region);
+ else {
+ if (nd_region->flush(nd_region, bio))
+ rc = -EIO;
+ }
+
+ return rc;
+}
/**
* nvdimm_flush - flush any posted write queues between the cpu and pmem media
* @nd_region: blk or interleaved pmem region
*/
-void nvdimm_flush(struct nd_region *nd_region)
+int generic_nvdimm_flush(struct nd_region *nd_region)
{
struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
int i, idx;
@@ -1145,6 +1165,8 @@ void nvdimm_flush(struct nd_region *nd_region)
if (ndrd_get_flush_wpq(ndrd, i, 0))
writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
wmb();
+
+ return 0;
}
EXPORT_SYMBOL_GPL(nvdimm_flush);
@@ -1189,6 +1211,13 @@ int nvdimm_has_cache(struct nd_region *nd_region)
}
EXPORT_SYMBOL_GPL(nvdimm_has_cache);
+bool is_nvdimm_sync(struct nd_region *nd_region)
+{
+ return is_nd_pmem(&nd_region->dev) &&
+ !test_bit(ND_REGION_ASYNC, &nd_region->flags);
+}
+EXPORT_SYMBOL_GPL(is_nvdimm_sync);
+
struct conflict_context {
struct nd_region *nd_region;
resource_size_t start, size;
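After the region_devs.c changes, nvdimm_flush() is a thin dispatcher: a region whose provider installed a ->flush callback (virtio-pmem sets ndr_desc.flush = async_pmem_flush) uses it, and everything else falls through to generic_nvdimm_flush(), which still writes the WPQ flush hints. The dispatch-with-fallback shape as a self-contained userspace C sketch (illustrative names):

    #include <stdio.h>

    struct region {
            int (*flush)(struct region *r);   /* provider override, may be NULL */
    };

    static int generic_flush(struct region *r)
    {
            puts("generic flush (write WPQ hints)");
            return 0;
    }

    static int provider_flush(struct region *r)
    {
            puts("provider flush");
            return 0;
    }

    static int region_flush(struct region *r)
    {
            if (!r->flush)
                    return generic_flush(r);  /* no override: built-in path */
            return r->flush(r) ? -1 : 0;      /* any override failure -> error */
    }

    int main(void)
    {
            struct region plain = { .flush = NULL };
            struct region virt  = { .flush = provider_flush };

            region_flush(&plain);   /* takes the generic path */
            region_flush(&virt);    /* takes the provider path */
            return 0;
    }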
diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
new file mode 100644
index 000000000000..5e3d07b47e0c
--- /dev/null
+++ b/drivers/nvdimm/virtio_pmem.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio_pmem.c: Virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and registers the virtual pmem device
+ * with libnvdimm core.
+ */
+#include "virtio_pmem.h"
+#include "nd.h"
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_PMEM, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+ /* Initialize virt queue */
+static int init_vq(struct virtio_pmem *vpmem)
+{
+ /* single vq */
+ vpmem->req_vq = virtio_find_single_vq(vpmem->vdev,
+ virtio_pmem_host_ack, "flush_queue");
+ if (IS_ERR(vpmem->req_vq))
+ return PTR_ERR(vpmem->req_vq);
+
+ spin_lock_init(&vpmem->pmem_lock);
+ INIT_LIST_HEAD(&vpmem->req_list);
+
+ return 0;
+};
+
+static int virtio_pmem_probe(struct virtio_device *vdev)
+{
+ struct nd_region_desc ndr_desc = {};
+ int nid = dev_to_node(&vdev->dev);
+ struct nd_region *nd_region;
+ struct virtio_pmem *vpmem;
+ struct resource res;
+ int err = 0;
+
+ if (!vdev->config->get) {
+ dev_err(&vdev->dev, "%s failure: config access disabled\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ vpmem = devm_kzalloc(&vdev->dev, sizeof(*vpmem), GFP_KERNEL);
+ if (!vpmem) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ vpmem->vdev = vdev;
+ vdev->priv = vpmem;
+ err = init_vq(vpmem);
+ if (err) {
+ dev_err(&vdev->dev, "failed to initialize virtio pmem vq's\n");
+ goto out_err;
+ }
+
+ virtio_cread(vpmem->vdev, struct virtio_pmem_config,
+ start, &vpmem->start);
+ virtio_cread(vpmem->vdev, struct virtio_pmem_config,
+ size, &vpmem->size);
+
+ res.start = vpmem->start;
+ res.end = vpmem->start + vpmem->size - 1;
+ vpmem->nd_desc.provider_name = "virtio-pmem";
+ vpmem->nd_desc.module = THIS_MODULE;
+
+ vpmem->nvdimm_bus = nvdimm_bus_register(&vdev->dev,
+ &vpmem->nd_desc);
+ if (!vpmem->nvdimm_bus) {
+ dev_err(&vdev->dev, "failed to register device with nvdimm_bus\n");
+ err = -ENXIO;
+ goto out_vq;
+ }
+
+ dev_set_drvdata(&vdev->dev, vpmem->nvdimm_bus);
+
+ ndr_desc.res = &res;
+ ndr_desc.numa_node = nid;
+ ndr_desc.flush = async_pmem_flush;
+ set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
+ set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
+ nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
+ if (!nd_region) {
+ dev_err(&vdev->dev, "failed to create nvdimm region\n");
+ err = -ENXIO;
+ goto out_nd;
+ }
+ nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
+ return 0;
+out_nd:
+ nvdimm_bus_unregister(vpmem->nvdimm_bus);
+out_vq:
+ vdev->config->del_vqs(vdev);
+out_err:
+ return err;
+}
+
+static void virtio_pmem_remove(struct virtio_device *vdev)
+{
+ struct nvdimm_bus *nvdimm_bus = dev_get_drvdata(&vdev->dev);
+
+ nvdimm_bus_unregister(nvdimm_bus);
+ vdev->config->del_vqs(vdev);
+ vdev->config->reset(vdev);
+}
+
+static struct virtio_driver virtio_pmem_driver = {
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = virtio_pmem_probe,
+ .remove = virtio_pmem_remove,
+};
+
+module_virtio_driver(virtio_pmem_driver);
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio pmem driver");
+MODULE_LICENSE("GPL");
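virtio_pmem_probe() above reads the host-advertised start/size from virtio config space, registers an nvdimm bus, then creates a pmem region flagged ND_REGION_ASYNC with ->flush = async_pmem_flush; the out_nd/out_vq labels unwind those steps in reverse. A compact, runnable C model of that unwind-in-reverse idiom (every step is a stand-in, not the driver's function):

    #include <stdio.h>

    static int  init_queue(void)     { puts("init vq");       return 0;  }
    static void teardown_queue(void) { puts("del vqs");                  }
    static int  register_bus(void)   { puts("register bus");  return 0;  }
    static void unregister_bus(void) { puts("unregister bus");           }
    static int  create_region(void)  { puts("create region"); return -1; }

    static int probe(void)
    {
            int err;

            if ((err = init_queue()))
                    return err;
            if ((err = register_bus()))
                    goto out_vq;
            if ((err = create_region()))
                    goto out_bus;           /* fails here: unwind both */
            return 0;
    out_bus:
            unregister_bus();
    out_vq:
            teardown_queue();
            return err;
    }

    int main(void) { return probe() ? 1 : 0; }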
diff --git a/drivers/nvdimm/virtio_pmem.h b/drivers/nvdimm/virtio_pmem.h
new file mode 100644
index 000000000000..0dddefe594c4
--- /dev/null
+++ b/drivers/nvdimm/virtio_pmem.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * virtio_pmem.h: virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and provides a virtio based flushing
+ * interface.
+ **/
+
+#ifndef _LINUX_VIRTIO_PMEM_H
+#define _LINUX_VIRTIO_PMEM_H
+
+#include <linux/module.h>
+#include <uapi/linux/virtio_pmem.h>
+#include <linux/libnvdimm.h>
+#include <linux/spinlock.h>
+
+struct virtio_pmem_request {
+ struct virtio_pmem_req req;
+ struct virtio_pmem_resp resp;
+
+ /* Wait queue to process deferred work after ack from host */
+ wait_queue_head_t host_acked;
+ bool done;
+
+ /* Wait queue to process deferred work after virt queue buffer avail */
+ wait_queue_head_t wq_buf;
+ bool wq_buf_avail;
+ struct list_head list;
+};
+
+struct virtio_pmem {
+ struct virtio_device *vdev;
+
+ /* Virtio pmem request queue */
+ struct virtqueue *req_vq;
+
+ /* nvdimm bus registers virtio pmem device */
+ struct nvdimm_bus *nvdimm_bus;
+ struct nvdimm_bus_descriptor nd_desc;
+
+ /* List to store deferred work if virtqueue is full */
+ struct list_head req_list;
+
+ /* Synchronize virtqueue data */
+ spinlock_t pmem_lock;
+
+ /* Memory region information */
+ __u64 start;
+ __u64 size;
+};
+
+void virtio_pmem_host_ack(struct virtqueue *vq);
+int async_pmem_flush(struct nd_region *nd_region, struct bio *bio);
+#endif
diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig
index 42d3798c88f0..dc1c1381d7fa 100644
--- a/drivers/powercap/Kconfig
+++ b/drivers/powercap/Kconfig
@@ -16,14 +16,17 @@ menuconfig POWERCAP
if POWERCAP
# Client driver configurations go here.
+config INTEL_RAPL_CORE
+ tristate
+
config INTEL_RAPL
- tristate "Intel RAPL Support"
+ tristate "Intel RAPL Support via MSR Interface"
depends on X86 && IOSF_MBI
- default n
+ select INTEL_RAPL_CORE
---help---
This enables support for the Intel Running Average Power Limit (RAPL)
- technology which allows power limits to be enforced and monitored on
- modern Intel processors (Sandy Bridge and later).
+ technology via MSR interface, which allows power limits to be enforced
+ and monitored on modern Intel processors (Sandy Bridge and later).
In RAPL, the platform level settings are divided into domains for
fine grained control. These domains include processor package, DRAM
diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile
index 81c8ccaba6e7..7255c94ec61c 100644
--- a/drivers/powercap/Makefile
+++ b/drivers/powercap/Makefile
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_POWERCAP) += powercap_sys.o
-obj-$(CONFIG_INTEL_RAPL) += intel_rapl.o
+obj-$(CONFIG_INTEL_RAPL_CORE) += intel_rapl_common.o
+obj-$(CONFIG_INTEL_RAPL) += intel_rapl_msr.o
obj-$(CONFIG_IDLE_INJECT) += idle_inject.o
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl_common.c
index 8692f6b79f93..9fd6dd342169 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Intel Running Average Power Limit (RAPL) Driver
- * Copyright (c) 2013, Intel Corporation.
+ * Common code for Intel Running Average Power Limit (RAPL) support.
+ * Copyright (c) 2019, Intel Corporation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -18,9 +18,11 @@
#include <linux/cpu.h>
#include <linux/powercap.h>
#include <linux/suspend.h>
-#include <asm/iosf_mbi.h>
+#include <linux/intel_rapl.h>
+#include <linux/processor.h>
+#include <linux/platform_device.h>
-#include <asm/processor.h>
+#include <asm/iosf_mbi.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
@@ -37,8 +39,8 @@
#define POWER_LIMIT2_MASK (0x7FFFULL<<32)
#define POWER_LIMIT2_ENABLE BIT_ULL(47)
#define POWER_LIMIT2_CLAMP BIT_ULL(48)
-#define POWER_PACKAGE_LOCK BIT_ULL(63)
-#define POWER_PP_LOCK BIT(31)
+#define POWER_HIGH_LOCK BIT_ULL(63)
+#define POWER_LOW_LOCK BIT(31)
#define TIME_WINDOW1_MASK (0x7FULL<<17)
#define TIME_WINDOW2_MASK (0x7FULL<<49)
@@ -61,143 +63,38 @@
#define PP_POLICY_MASK 0x1F
/* Non HW constants */
-#define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */
+#define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */
#define RAPL_PRIMITIVE_DUMMY BIT(2)
#define TIME_WINDOW_MAX_MSEC 40000
#define TIME_WINDOW_MIN_MSEC 250
-#define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */
+#define ENERGY_UNIT_SCALE 1000 /* scale from driver unit to powercap unit */
enum unit_type {
- ARBITRARY_UNIT, /* no translation */
+ ARBITRARY_UNIT, /* no translation */
POWER_UNIT,
ENERGY_UNIT,
TIME_UNIT,
};
-enum rapl_domain_type {
- RAPL_DOMAIN_PACKAGE, /* entire package/socket */
- RAPL_DOMAIN_PP0, /* core power plane */
- RAPL_DOMAIN_PP1, /* graphics uncore */
- RAPL_DOMAIN_DRAM,/* DRAM control_type */
- RAPL_DOMAIN_PLATFORM, /* PSys control_type */
- RAPL_DOMAIN_MAX,
-};
-
-enum rapl_domain_msr_id {
- RAPL_DOMAIN_MSR_LIMIT,
- RAPL_DOMAIN_MSR_STATUS,
- RAPL_DOMAIN_MSR_PERF,
- RAPL_DOMAIN_MSR_POLICY,
- RAPL_DOMAIN_MSR_INFO,
- RAPL_DOMAIN_MSR_MAX,
-};
-
/* per domain data, some are optional */
-enum rapl_primitives {
- ENERGY_COUNTER,
- POWER_LIMIT1,
- POWER_LIMIT2,
- FW_LOCK,
-
- PL1_ENABLE, /* power limit 1, aka long term */
- PL1_CLAMP, /* allow frequency to go below OS request */
- PL2_ENABLE, /* power limit 2, aka short term, instantaneous */
- PL2_CLAMP,
-
- TIME_WINDOW1, /* long term */
- TIME_WINDOW2, /* short term */
- THERMAL_SPEC_POWER,
- MAX_POWER,
-
- MIN_POWER,
- MAX_TIME_WINDOW,
- THROTTLED_TIME,
- PRIORITY_LEVEL,
-
- /* below are not raw primitive data */
- AVERAGE_POWER,
- NR_RAPL_PRIMITIVES,
-};
-
#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2)
-/* Can be expanded to include events, etc.*/
-struct rapl_domain_data {
- u64 primitives[NR_RAPL_PRIMITIVES];
- unsigned long timestamp;
-};
-
-struct msrl_action {
- u32 msr_no;
- u64 clear_mask;
- u64 set_mask;
- int err;
-};
-
#define DOMAIN_STATE_INACTIVE BIT(0)
#define DOMAIN_STATE_POWER_LIMIT_SET BIT(1)
#define DOMAIN_STATE_BIOS_LOCKED BIT(2)
-#define NR_POWER_LIMITS (2)
-struct rapl_power_limit {
- struct powercap_zone_constraint *constraint;
- int prim_id; /* primitive ID used to enable */
- struct rapl_domain *domain;
- const char *name;
- u64 last_power_limit;
-};
-
static const char pl1_name[] = "long_term";
static const char pl2_name[] = "short_term";
-struct rapl_package;
-struct rapl_domain {
- const char *name;
- enum rapl_domain_type id;
- int msrs[RAPL_DOMAIN_MSR_MAX];
- struct powercap_zone power_zone;
- struct rapl_domain_data rdd;
- struct rapl_power_limit rpl[NR_POWER_LIMITS];
- u64 attr_map; /* track capabilities */
- unsigned int state;
- unsigned int domain_energy_unit;
- struct rapl_package *rp;
-};
#define power_zone_to_rapl_domain(_zone) \
container_of(_zone, struct rapl_domain, power_zone)
-/* maximum rapl package domain name: package-%d-die-%d */
-#define PACKAGE_DOMAIN_NAME_LENGTH 30
-
-
-/* Each rapl package contains multiple domains, these are the common
- * data across RAPL domains within a package.
- */
-struct rapl_package {
- unsigned int id; /* logical die id, equals physical 1-die systems */
- unsigned int nr_domains;
- unsigned long domain_map; /* bit map of active domains */
- unsigned int power_unit;
- unsigned int energy_unit;
- unsigned int time_unit;
- struct rapl_domain *domains; /* array of domains, sized at runtime */
- struct powercap_zone *power_zone; /* keep track of parent zone */
- unsigned long power_limit_irq; /* keep track of package power limit
- * notify interrupt enable status.
- */
- struct list_head plist;
- int lead_cpu; /* one active cpu per package for access */
- /* Track active cpus */
- struct cpumask cpumask;
- char name[PACKAGE_DOMAIN_NAME_LENGTH];
-};
-
struct rapl_defaults {
u8 floor_freq_reg_addr;
int (*check_unit)(struct rapl_package *rp, int cpu);
void (*set_floor_freq)(struct rapl_domain *rd, bool mode);
u64 (*compute_time_window)(struct rapl_package *rp, u64 val,
- bool to_raw);
+ bool to_raw);
unsigned int dram_domain_energy_unit;
};
static struct rapl_defaults *rapl_defaults;
@@ -216,7 +113,7 @@ struct rapl_primitive_info {
const char *name;
u64 mask;
int shift;
- enum rapl_domain_msr_id id;
+ enum rapl_domain_reg_id id;
enum unit_type unit;
u32 flag;
};
@@ -232,19 +129,18 @@ struct rapl_primitive_info {
static void rapl_init_domains(struct rapl_package *rp);
static int rapl_read_data_raw(struct rapl_domain *rd,
- enum rapl_primitives prim,
- bool xlate, u64 *data);
+ enum rapl_primitives prim,
+ bool xlate, u64 *data);
static int rapl_write_data_raw(struct rapl_domain *rd,
- enum rapl_primitives prim,
- unsigned long long value);
+ enum rapl_primitives prim,
+ unsigned long long value);
static u64 rapl_unit_xlate(struct rapl_domain *rd,
- enum unit_type type, u64 value,
- int to_raw);
+ enum unit_type type, u64 value, int to_raw);
static void package_power_limit_irq_save(struct rapl_package *rp);
-static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */
+static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */
-static const char * const rapl_domain_names[] = {
+static const char *const rapl_domain_names[] = {
"package",
"core",
"uncore",
@@ -252,24 +148,8 @@ static const char * const rapl_domain_names[] = {
"psys",
};
-static struct powercap_control_type *control_type; /* PowerCap Controller */
-static struct rapl_domain *platform_rapl_domain; /* Platform (PSys) domain */
-
-/* caller to ensure CPU hotplug lock is held */
-static struct rapl_package *rapl_find_package_domain(int cpu)
-{
- int id = topology_logical_die_id(cpu);
- struct rapl_package *rp;
-
- list_for_each_entry(rp, &rapl_packages, plist) {
- if (rp->id == id)
- return rp;
- }
-
- return NULL;
-}
-
-static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw)
+static int get_energy_counter(struct powercap_zone *power_zone,
+ u64 *energy_raw)
{
struct rapl_domain *rd;
u64 energy_now;
@@ -368,50 +248,49 @@ static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
static const struct powercap_zone_ops zone_ops[] = {
/* RAPL_DOMAIN_PACKAGE */
{
- .get_energy_uj = get_energy_counter,
- .get_max_energy_range_uj = get_max_energy_counter,
- .release = release_zone,
- .set_enable = set_domain_enable,
- .get_enable = get_domain_enable,
- },
+ .get_energy_uj = get_energy_counter,
+ .get_max_energy_range_uj = get_max_energy_counter,
+ .release = release_zone,
+ .set_enable = set_domain_enable,
+ .get_enable = get_domain_enable,
+ },
/* RAPL_DOMAIN_PP0 */
{
- .get_energy_uj = get_energy_counter,
- .get_max_energy_range_uj = get_max_energy_counter,
- .release = release_zone,
- .set_enable = set_domain_enable,
- .get_enable = get_domain_enable,
- },
+ .get_energy_uj = get_energy_counter,
+ .get_max_energy_range_uj = get_max_energy_counter,
+ .release = release_zone,
+ .set_enable = set_domain_enable,
+ .get_enable = get_domain_enable,
+ },
/* RAPL_DOMAIN_PP1 */
{
- .get_energy_uj = get_energy_counter,
- .get_max_energy_range_uj = get_max_energy_counter,
- .release = release_zone,
- .set_enable = set_domain_enable,
- .get_enable = get_domain_enable,
- },
+ .get_energy_uj = get_energy_counter,
+ .get_max_energy_range_uj = get_max_energy_counter,
+ .release = release_zone,
+ .set_enable = set_domain_enable,
+ .get_enable = get_domain_enable,
+ },
/* RAPL_DOMAIN_DRAM */
{
- .get_energy_uj = get_energy_counter,
- .get_max_energy_range_uj = get_max_energy_counter,
- .release = release_zone,
- .set_enable = set_domain_enable,
- .get_enable = get_domain_enable,
- },
+ .get_energy_uj = get_energy_counter,
+ .get_max_energy_range_uj = get_max_energy_counter,
+ .release = release_zone,
+ .set_enable = set_domain_enable,
+ .get_enable = get_domain_enable,
+ },
/* RAPL_DOMAIN_PLATFORM */
{
- .get_energy_uj = get_energy_counter,
- .get_max_energy_range_uj = get_max_energy_counter,
- .release = release_zone,
- .set_enable = set_domain_enable,
- .get_enable = get_domain_enable,
- },
+ .get_energy_uj = get_energy_counter,
+ .get_max_energy_range_uj = get_max_energy_counter,
+ .release = release_zone,
+ .set_enable = set_domain_enable,
+ .get_enable = get_domain_enable,
+ },
};
-
/*
* Constraint index used by powercap can be different than power limit (PL)
- * index in that some PLs maybe missing due to non-existant MSRs. So we
+ * index in that some PLs may be missing due to non-existent MSRs. So we
* need to convert here by finding the valid PLs only (name populated).
*/
static int contraint_to_pl(struct rapl_domain *rd, int cid)
@@ -430,7 +309,7 @@ static int contraint_to_pl(struct rapl_domain *rd, int cid)
}
static int set_power_limit(struct powercap_zone *power_zone, int cid,
- u64 power_limit)
+ u64 power_limit)
{
struct rapl_domain *rd;
struct rapl_package *rp;
@@ -448,8 +327,8 @@ static int set_power_limit(struct powercap_zone *power_zone, int cid,
rp = rd->rp;
if (rd->state & DOMAIN_STATE_BIOS_LOCKED) {
- dev_warn(&power_zone->dev, "%s locked by BIOS, monitoring only\n",
- rd->name);
+ dev_warn(&power_zone->dev,
+ "%s locked by BIOS, monitoring only\n", rd->name);
ret = -EACCES;
goto set_exit;
}
@@ -472,7 +351,7 @@ set_exit:
}
static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
- u64 *data)
+ u64 *data)
{
struct rapl_domain *rd;
u64 val;
@@ -511,7 +390,7 @@ get_exit:
}
static int set_time_window(struct powercap_zone *power_zone, int cid,
- u64 window)
+ u64 window)
{
struct rapl_domain *rd;
int ret = 0;
@@ -541,7 +420,8 @@ set_time_exit:
return ret;
}
-static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data)
+static int get_time_window(struct powercap_zone *power_zone, int cid,
+ u64 *data)
{
struct rapl_domain *rd;
u64 val;
@@ -576,7 +456,8 @@ get_time_exit:
return ret;
}
-static const char *get_constraint_name(struct powercap_zone *power_zone, int cid)
+static const char *get_constraint_name(struct powercap_zone *power_zone,
+ int cid)
{
struct rapl_domain *rd;
int id;
@@ -589,9 +470,7 @@ static const char *get_constraint_name(struct powercap_zone *power_zone, int cid
return NULL;
}
-
-static int get_max_power(struct powercap_zone *power_zone, int id,
- u64 *data)
+static int get_max_power(struct powercap_zone *power_zone, int id, u64 *data)
{
struct rapl_domain *rd;
u64 val;
@@ -633,73 +512,43 @@ static const struct powercap_zone_constraint_ops constraint_ops = {
/* called after domain detection and package level data are set */
static void rapl_init_domains(struct rapl_package *rp)
{
- int i;
+ enum rapl_domain_type i;
+ enum rapl_domain_reg_id j;
struct rapl_domain *rd = rp->domains;
for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
unsigned int mask = rp->domain_map & (1 << i);
- switch (mask) {
- case BIT(RAPL_DOMAIN_PACKAGE):
- rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE];
- rd->id = RAPL_DOMAIN_PACKAGE;
- rd->msrs[0] = MSR_PKG_POWER_LIMIT;
- rd->msrs[1] = MSR_PKG_ENERGY_STATUS;
- rd->msrs[2] = MSR_PKG_PERF_STATUS;
- rd->msrs[3] = 0;
- rd->msrs[4] = MSR_PKG_POWER_INFO;
- rd->rpl[0].prim_id = PL1_ENABLE;
- rd->rpl[0].name = pl1_name;
+
+ if (!mask)
+ continue;
+
+ rd->rp = rp;
+ rd->name = rapl_domain_names[i];
+ rd->id = i;
+ rd->rpl[0].prim_id = PL1_ENABLE;
+ rd->rpl[0].name = pl1_name;
+ /* some domain may support two power limits */
+ if (rp->priv->limits[i] == 2) {
rd->rpl[1].prim_id = PL2_ENABLE;
rd->rpl[1].name = pl2_name;
- break;
- case BIT(RAPL_DOMAIN_PP0):
- rd->name = rapl_domain_names[RAPL_DOMAIN_PP0];
- rd->id = RAPL_DOMAIN_PP0;
- rd->msrs[0] = MSR_PP0_POWER_LIMIT;
- rd->msrs[1] = MSR_PP0_ENERGY_STATUS;
- rd->msrs[2] = 0;
- rd->msrs[3] = MSR_PP0_POLICY;
- rd->msrs[4] = 0;
- rd->rpl[0].prim_id = PL1_ENABLE;
- rd->rpl[0].name = pl1_name;
- break;
- case BIT(RAPL_DOMAIN_PP1):
- rd->name = rapl_domain_names[RAPL_DOMAIN_PP1];
- rd->id = RAPL_DOMAIN_PP1;
- rd->msrs[0] = MSR_PP1_POWER_LIMIT;
- rd->msrs[1] = MSR_PP1_ENERGY_STATUS;
- rd->msrs[2] = 0;
- rd->msrs[3] = MSR_PP1_POLICY;
- rd->msrs[4] = 0;
- rd->rpl[0].prim_id = PL1_ENABLE;
- rd->rpl[0].name = pl1_name;
- break;
- case BIT(RAPL_DOMAIN_DRAM):
- rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM];
- rd->id = RAPL_DOMAIN_DRAM;
- rd->msrs[0] = MSR_DRAM_POWER_LIMIT;
- rd->msrs[1] = MSR_DRAM_ENERGY_STATUS;
- rd->msrs[2] = MSR_DRAM_PERF_STATUS;
- rd->msrs[3] = 0;
- rd->msrs[4] = MSR_DRAM_POWER_INFO;
- rd->rpl[0].prim_id = PL1_ENABLE;
- rd->rpl[0].name = pl1_name;
+ }
+
+ for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
+ rd->regs[j] = rp->priv->regs[i][j];
+
+ if (i == RAPL_DOMAIN_DRAM) {
rd->domain_energy_unit =
- rapl_defaults->dram_domain_energy_unit;
+ rapl_defaults->dram_domain_energy_unit;
if (rd->domain_energy_unit)
pr_info("DRAM domain energy unit %dpj\n",
rd->domain_energy_unit);
- break;
- }
- if (mask) {
- rd->rp = rp;
- rd++;
}
+ rd++;
}
}
static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
- u64 value, int to_raw)
+ u64 value, int to_raw)
{
u64 units = 1;
struct rapl_package *rp = rd->rp;
@@ -736,40 +585,40 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
static struct rapl_primitive_info rpi[] = {
/* name, mask, shift, msr index, unit divisor */
PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0,
- RAPL_DOMAIN_MSR_STATUS, ENERGY_UNIT, 0),
+ RAPL_DOMAIN_REG_STATUS, ENERGY_UNIT, 0),
PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0,
- RAPL_DOMAIN_MSR_LIMIT, POWER_UNIT, 0),
+ RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32,
- RAPL_DOMAIN_MSR_LIMIT, POWER_UNIT, 0),
- PRIMITIVE_INFO_INIT(FW_LOCK, POWER_PP_LOCK, 31,
- RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0),
+ RAPL_DOMAIN_REG_LIMIT, POWER_UNIT, 0),
+ PRIMITIVE_INFO_INIT(FW_LOCK, POWER_LOW_LOCK, 31,
+ RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15,
- RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0),
+ RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16,
- RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0),
+ RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47,
- RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0),
+ RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48,
- RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0),
+ RAPL_DOMAIN_REG_LIMIT, ARBITRARY_UNIT, 0),
PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17,
- RAPL_DOMAIN_MSR_LIMIT, TIME_UNIT, 0),
+ RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49,
- RAPL_DOMAIN_MSR_LIMIT, TIME_UNIT, 0),
+ RAPL_DOMAIN_REG_LIMIT, TIME_UNIT, 0),
PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK,
- 0, RAPL_DOMAIN_MSR_INFO, POWER_UNIT, 0),
+ 0, RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32,
- RAPL_DOMAIN_MSR_INFO, POWER_UNIT, 0),
+ RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16,
- RAPL_DOMAIN_MSR_INFO, POWER_UNIT, 0),
+ RAPL_DOMAIN_REG_INFO, POWER_UNIT, 0),
PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48,
- RAPL_DOMAIN_MSR_INFO, TIME_UNIT, 0),
+ RAPL_DOMAIN_REG_INFO, TIME_UNIT, 0),
PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0,
- RAPL_DOMAIN_MSR_PERF, TIME_UNIT, 0),
+ RAPL_DOMAIN_REG_PERF, TIME_UNIT, 0),
PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0,
- RAPL_DOMAIN_MSR_POLICY, ARBITRARY_UNIT, 0),
+ RAPL_DOMAIN_REG_POLICY, ARBITRARY_UNIT, 0),
/* non-hardware */
PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT,
- RAPL_PRIMITIVE_DERIVED),
+ RAPL_PRIMITIVE_DERIVED),
{NULL, 0, 0, 0},
};
@@ -787,26 +636,25 @@ static struct rapl_primitive_info rpi[] = {
* 63-------------------------- 31--------------------------- 0
*/
static int rapl_read_data_raw(struct rapl_domain *rd,
- enum rapl_primitives prim,
- bool xlate, u64 *data)
+ enum rapl_primitives prim, bool xlate, u64 *data)
{
- u64 value, final;
- u32 msr;
+ u64 value;
struct rapl_primitive_info *rp = &rpi[prim];
+ struct reg_action ra;
int cpu;
if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY)
return -EINVAL;
- msr = rd->msrs[rp->id];
- if (!msr)
+ ra.reg = rd->regs[rp->id];
+ if (!ra.reg)
return -EINVAL;
cpu = rd->rp->lead_cpu;
- /* special-case package domain, which uses a different bit*/
- if (prim == FW_LOCK && rd->id == RAPL_DOMAIN_PACKAGE) {
- rp->mask = POWER_PACKAGE_LOCK;
+ /* a domain with 2 power limits uses a different lock bit */
+ if (prim == FW_LOCK && rd->rp->priv->limits[rd->id] == 2) {
+ rp->mask = POWER_HIGH_LOCK;
rp->shift = 63;
}
/* non-hardware data are collected by the polling thread */
@@ -815,56 +663,32 @@ static int rapl_read_data_raw(struct rapl_domain *rd,
return 0;
}
- if (rdmsrl_safe_on_cpu(cpu, msr, &value)) {
- pr_debug("failed to read msr 0x%x on cpu %d\n", msr, cpu);
+ ra.mask = rp->mask;
+
+ if (rd->rp->priv->read_raw(cpu, &ra)) {
+ pr_debug("failed to read reg 0x%llx on cpu %d\n", ra.reg, cpu);
return -EIO;
}
- final = value & rp->mask;
- final = final >> rp->shift;
+ value = ra.value >> rp->shift;
+
if (xlate)
- *data = rapl_unit_xlate(rd, rp->unit, final, 0);
+ *data = rapl_unit_xlate(rd, rp->unit, value, 0);
else
- *data = final;
+ *data = value;
return 0;
}
-
-static int msrl_update_safe(u32 msr_no, u64 clear_mask, u64 set_mask)
-{
- int err;
- u64 val;
-
- err = rdmsrl_safe(msr_no, &val);
- if (err)
- goto out;
-
- val &= ~clear_mask;
- val |= set_mask;
-
- err = wrmsrl_safe(msr_no, val);
-
-out:
- return err;
-}
-
-static void msrl_update_func(void *info)
-{
- struct msrl_action *ma = info;
-
- ma->err = msrl_update_safe(ma->msr_no, ma->clear_mask, ma->set_mask);
-}
-
/* Similar use of primitive info in the read counterpart */
static int rapl_write_data_raw(struct rapl_domain *rd,
- enum rapl_primitives prim,
- unsigned long long value)
+ enum rapl_primitives prim,
+ unsigned long long value)
{
struct rapl_primitive_info *rp = &rpi[prim];
int cpu;
u64 bits;
- struct msrl_action ma;
+ struct reg_action ra;
int ret;
cpu = rd->rp->lead_cpu;
@@ -872,17 +696,13 @@ static int rapl_write_data_raw(struct rapl_domain *rd,
bits <<= rp->shift;
bits &= rp->mask;
- memset(&ma, 0, sizeof(ma));
+ memset(&ra, 0, sizeof(ra));
- ma.msr_no = rd->msrs[rp->id];
- ma.clear_mask = rp->mask;
- ma.set_mask = bits;
+ ra.reg = rd->regs[rp->id];
+ ra.mask = rp->mask;
+ ra.value = bits;
- ret = smp_call_function_single(cpu, msrl_update_func, &ma, 1);
- if (ret)
- WARN_ON_ONCE(ret);
- else
- ret = ma.err;
+ ret = rd->rp->priv->write_raw(cpu, &ra);
return ret;
}
@@ -900,51 +720,56 @@ static int rapl_write_data_raw(struct rapl_domain *rd,
*/
static int rapl_check_unit_core(struct rapl_package *rp, int cpu)
{
- u64 msr_val;
+ struct reg_action ra;
u32 value;
- if (rdmsrl_safe_on_cpu(cpu, MSR_RAPL_POWER_UNIT, &msr_val)) {
- pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n",
- MSR_RAPL_POWER_UNIT, cpu);
+ ra.reg = rp->priv->reg_unit;
+ ra.mask = ~0;
+ if (rp->priv->read_raw(cpu, &ra)) {
+ pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
+ rp->priv->reg_unit, cpu);
return -ENODEV;
}
- value = (msr_val & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
+ value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
rp->energy_unit = ENERGY_UNIT_SCALE * 1000000 / (1 << value);
- value = (msr_val & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
+ value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
rp->power_unit = 1000000 / (1 << value);
- value = (msr_val & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
+ value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
rp->time_unit = 1000000 / (1 << value);
pr_debug("Core CPU %s energy=%dpJ, time=%dus, power=%duW\n",
- rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
+ rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
return 0;
}
static int rapl_check_unit_atom(struct rapl_package *rp, int cpu)
{
- u64 msr_val;
+ struct reg_action ra;
u32 value;
- if (rdmsrl_safe_on_cpu(cpu, MSR_RAPL_POWER_UNIT, &msr_val)) {
- pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n",
- MSR_RAPL_POWER_UNIT, cpu);
+ ra.reg = rp->priv->reg_unit;
+ ra.mask = ~0;
+ if (rp->priv->read_raw(cpu, &ra)) {
+ pr_err("Failed to read power unit REG 0x%llx on CPU %d, exit.\n",
+ rp->priv->reg_unit, cpu);
return -ENODEV;
}
- value = (msr_val & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
+
+ value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
rp->energy_unit = ENERGY_UNIT_SCALE * 1 << value;
- value = (msr_val & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
+ value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
rp->power_unit = (1 << value) * 1000;
- value = (msr_val & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
+ value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
rp->time_unit = 1000000 / (1 << value);
pr_debug("Atom %s energy=%dpJ, time=%dus, power=%duW\n",
- rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
+ rp->name, rp->energy_unit, rp->time_unit, rp->power_unit);
return 0;
}
@@ -964,7 +789,6 @@ static void power_limit_irq_save_cpu(void *info)
wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}
-
/* REVISIT:
* When package power limit is set artificially low by RAPL, LVT
* thermal interrupt for package power limit should be ignored
@@ -1048,9 +872,9 @@ static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
}
static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
- bool to_raw)
+ bool to_raw)
{
- u64 f, y; /* fraction and exp. used for time unit */
+ u64 f, y; /* fraction and exp. used for time unit */
/*
* Special processing based on 2^Y*(1+F/4), refer
@@ -1070,7 +894,7 @@ static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
}
static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
- bool to_raw)
+ bool to_raw)
{
/*
* Atom time unit encoding is straightforward: val * time_unit,
@@ -1078,8 +902,8 @@ static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value,
*/
if (!to_raw)
return (value) ? value *= rp->time_unit : rp->time_unit;
- else
- value = div64_u64(value, rp->time_unit);
+
+ value = div64_u64(value, rp->time_unit);
return value;
}
@@ -1127,43 +951,48 @@ static const struct rapl_defaults rapl_defaults_cht = {
};
static const struct x86_cpu_id rapl_ids[] __initconst = {
- INTEL_CPU_FAM6(SANDYBRIDGE, rapl_defaults_core),
- INTEL_CPU_FAM6(SANDYBRIDGE_X, rapl_defaults_core),
-
- INTEL_CPU_FAM6(IVYBRIDGE, rapl_defaults_core),
- INTEL_CPU_FAM6(IVYBRIDGE_X, rapl_defaults_core),
-
- INTEL_CPU_FAM6(HASWELL_CORE, rapl_defaults_core),
- INTEL_CPU_FAM6(HASWELL_ULT, rapl_defaults_core),
- INTEL_CPU_FAM6(HASWELL_GT3E, rapl_defaults_core),
- INTEL_CPU_FAM6(HASWELL_X, rapl_defaults_hsw_server),
-
- INTEL_CPU_FAM6(BROADWELL_CORE, rapl_defaults_core),
- INTEL_CPU_FAM6(BROADWELL_GT3E, rapl_defaults_core),
- INTEL_CPU_FAM6(BROADWELL_XEON_D, rapl_defaults_core),
- INTEL_CPU_FAM6(BROADWELL_X, rapl_defaults_hsw_server),
-
- INTEL_CPU_FAM6(SKYLAKE_DESKTOP, rapl_defaults_core),
- INTEL_CPU_FAM6(SKYLAKE_MOBILE, rapl_defaults_core),
- INTEL_CPU_FAM6(SKYLAKE_X, rapl_defaults_hsw_server),
- INTEL_CPU_FAM6(KABYLAKE_MOBILE, rapl_defaults_core),
- INTEL_CPU_FAM6(KABYLAKE_DESKTOP, rapl_defaults_core),
- INTEL_CPU_FAM6(CANNONLAKE_MOBILE, rapl_defaults_core),
- INTEL_CPU_FAM6(ICELAKE_MOBILE, rapl_defaults_core),
-
- INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt),
- INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht),
- INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, rapl_defaults_tng),
- INTEL_CPU_FAM6(ATOM_AIRMONT_MID, rapl_defaults_ann),
- INTEL_CPU_FAM6(ATOM_GOLDMONT, rapl_defaults_core),
- INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, rapl_defaults_core),
- INTEL_CPU_FAM6(ATOM_GOLDMONT_X, rapl_defaults_core),
- INTEL_CPU_FAM6(ATOM_TREMONT_X, rapl_defaults_core),
-
- INTEL_CPU_FAM6(XEON_PHI_KNL, rapl_defaults_hsw_server),
- INTEL_CPU_FAM6(XEON_PHI_KNM, rapl_defaults_hsw_server),
+ INTEL_CPU_FAM6(SANDYBRIDGE, rapl_defaults_core),
+ INTEL_CPU_FAM6(SANDYBRIDGE_X, rapl_defaults_core),
+
+ INTEL_CPU_FAM6(IVYBRIDGE, rapl_defaults_core),
+ INTEL_CPU_FAM6(IVYBRIDGE_X, rapl_defaults_core),
+
+ INTEL_CPU_FAM6(HASWELL_CORE, rapl_defaults_core),
+ INTEL_CPU_FAM6(HASWELL_ULT, rapl_defaults_core),
+ INTEL_CPU_FAM6(HASWELL_GT3E, rapl_defaults_core),
+ INTEL_CPU_FAM6(HASWELL_X, rapl_defaults_hsw_server),
+
+ INTEL_CPU_FAM6(BROADWELL_CORE, rapl_defaults_core),
+ INTEL_CPU_FAM6(BROADWELL_GT3E, rapl_defaults_core),
+ INTEL_CPU_FAM6(BROADWELL_XEON_D, rapl_defaults_core),
+ INTEL_CPU_FAM6(BROADWELL_X, rapl_defaults_hsw_server),
+
+ INTEL_CPU_FAM6(SKYLAKE_DESKTOP, rapl_defaults_core),
+ INTEL_CPU_FAM6(SKYLAKE_MOBILE, rapl_defaults_core),
+ INTEL_CPU_FAM6(SKYLAKE_X, rapl_defaults_hsw_server),
+ INTEL_CPU_FAM6(KABYLAKE_MOBILE, rapl_defaults_core),
+ INTEL_CPU_FAM6(KABYLAKE_DESKTOP, rapl_defaults_core),
+ INTEL_CPU_FAM6(CANNONLAKE_MOBILE, rapl_defaults_core),
+ INTEL_CPU_FAM6(ICELAKE_MOBILE, rapl_defaults_core),
+ INTEL_CPU_FAM6(ICELAKE_DESKTOP, rapl_defaults_core),
+ INTEL_CPU_FAM6(ICELAKE_NNPI, rapl_defaults_core),
+ INTEL_CPU_FAM6(ICELAKE_X, rapl_defaults_hsw_server),
+ INTEL_CPU_FAM6(ICELAKE_XEON_D, rapl_defaults_hsw_server),
+
+ INTEL_CPU_FAM6(ATOM_SILVERMONT, rapl_defaults_byt),
+ INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht),
+ INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, rapl_defaults_tng),
+ INTEL_CPU_FAM6(ATOM_AIRMONT_MID, rapl_defaults_ann),
+ INTEL_CPU_FAM6(ATOM_GOLDMONT, rapl_defaults_core),
+ INTEL_CPU_FAM6(ATOM_GOLDMONT_PLUS, rapl_defaults_core),
+ INTEL_CPU_FAM6(ATOM_GOLDMONT_X, rapl_defaults_core),
+ INTEL_CPU_FAM6(ATOM_TREMONT_X, rapl_defaults_core),
+
+ INTEL_CPU_FAM6(XEON_PHI_KNL, rapl_defaults_hsw_server),
+ INTEL_CPU_FAM6(XEON_PHI_KNM, rapl_defaults_hsw_server),
{}
};
+
MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
/* Read once for all raw primitive data for domains */
@@ -1179,22 +1008,12 @@ static void rapl_update_domain_data(struct rapl_package *rp)
for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
if (!rapl_read_data_raw(&rp->domains[dmn], prim,
rpi[prim].unit, &val))
- rp->domains[dmn].rdd.primitives[prim] = val;
+ rp->domains[dmn].rdd.primitives[prim] = val;
}
}
}
-static void rapl_unregister_powercap(void)
-{
- if (platform_rapl_domain) {
- powercap_unregister_zone(control_type,
- &platform_rapl_domain->power_zone);
- kfree(platform_rapl_domain);
- }
- powercap_unregister_control_type(control_type);
-}
-
static int rapl_package_register_powercap(struct rapl_package *rp)
{
struct rapl_domain *rd;
@@ -1204,20 +1023,18 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
/* Update the domain data of the new package */
rapl_update_domain_data(rp);
- /* first we register package domain as the parent zone*/
+ /* first we register package domain as the parent zone */
for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
if (rd->id == RAPL_DOMAIN_PACKAGE) {
nr_pl = find_nr_power_limit(rd);
pr_debug("register package domain %s\n", rp->name);
power_zone = powercap_register_zone(&rd->power_zone,
- control_type,
- rp->name, NULL,
- &zone_ops[rd->id],
- nr_pl,
- &constraint_ops);
+ rp->priv->control_type, rp->name,
+ NULL, &zone_ops[rd->id], nr_pl,
+ &constraint_ops);
if (IS_ERR(power_zone)) {
pr_debug("failed to register power zone %s\n",
- rp->name);
+ rp->name);
return PTR_ERR(power_zone);
}
/* track parent zone in per package/socket data */
@@ -1230,21 +1047,21 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
pr_err("no package domain found, unknown topology!\n");
return -ENODEV;
}
- /* now register domains as children of the socket/package*/
+ /* now register domains as children of the socket/package */
for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
if (rd->id == RAPL_DOMAIN_PACKAGE)
continue;
/* number of power limits per domain varies */
nr_pl = find_nr_power_limit(rd);
power_zone = powercap_register_zone(&rd->power_zone,
- control_type, rd->name,
- rp->power_zone,
- &zone_ops[rd->id], nr_pl,
- &constraint_ops);
+ rp->priv->control_type,
+ rd->name, rp->power_zone,
+ &zone_ops[rd->id], nr_pl,
+ &constraint_ops);
if (IS_ERR(power_zone)) {
pr_debug("failed to register power_zone, %s:%s\n",
- rp->name, rd->name);
+ rp->name, rd->name);
ret = PTR_ERR(power_zone);
goto err_cleanup;
}
@@ -1258,22 +1075,30 @@ err_cleanup:
*/
while (--rd >= rp->domains) {
pr_debug("unregister %s domain %s\n", rp->name, rd->name);
- powercap_unregister_zone(control_type, &rd->power_zone);
+ powercap_unregister_zone(rp->priv->control_type,
+ &rd->power_zone);
}
return ret;
}
-static int __init rapl_register_psys(void)
+int rapl_add_platform_domain(struct rapl_if_priv *priv)
{
struct rapl_domain *rd;
struct powercap_zone *power_zone;
- u64 val;
+ struct reg_action ra;
+ int ret;
- if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_ENERGY_STATUS, &val) || !val)
+ ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
+ ra.mask = ~0;
+ ret = priv->read_raw(0, &ra);
+ if (ret || !ra.value)
return -ENODEV;
- if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_POWER_LIMIT, &val) || !val)
+ ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
+ ra.mask = ~0;
+ ret = priv->read_raw(0, &ra);
+ if (ret || !ra.value)
return -ENODEV;
rd = kzalloc(sizeof(*rd), GFP_KERNEL);
@@ -1282,15 +1107,17 @@ static int __init rapl_register_psys(void)
rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
rd->id = RAPL_DOMAIN_PLATFORM;
- rd->msrs[0] = MSR_PLATFORM_POWER_LIMIT;
- rd->msrs[1] = MSR_PLATFORM_ENERGY_STATUS;
+ rd->regs[RAPL_DOMAIN_REG_LIMIT] =
+ priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
+ rd->regs[RAPL_DOMAIN_REG_STATUS] =
+ priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
rd->rpl[0].prim_id = PL1_ENABLE;
rd->rpl[0].name = pl1_name;
rd->rpl[1].prim_id = PL2_ENABLE;
rd->rpl[1].name = pl2_name;
- rd->rp = rapl_find_package_domain(0);
+ rd->rp = rapl_find_package_domain(0, priv);
- power_zone = powercap_register_zone(&rd->power_zone, control_type,
+ power_zone = powercap_register_zone(&rd->power_zone, priv->control_type,
"psys", NULL,
&zone_ops[RAPL_DOMAIN_PLATFORM],
2, &constraint_ops);
@@ -1300,38 +1127,32 @@ static int __init rapl_register_psys(void)
return PTR_ERR(power_zone);
}
- platform_rapl_domain = rd;
+ priv->platform_rapl_domain = rd;
return 0;
}
+EXPORT_SYMBOL_GPL(rapl_add_platform_domain);
-static int __init rapl_register_powercap(void)
+void rapl_remove_platform_domain(struct rapl_if_priv *priv)
{
- control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
- if (IS_ERR(control_type)) {
- pr_debug("failed to register powercap control_type.\n");
- return PTR_ERR(control_type);
+ if (priv->platform_rapl_domain) {
+ powercap_unregister_zone(priv->control_type,
+ &priv->platform_rapl_domain->power_zone);
+ kfree(priv->platform_rapl_domain);
}
- return 0;
}
+EXPORT_SYMBOL_GPL(rapl_remove_platform_domain);
-static int rapl_check_domain(int cpu, int domain)
+static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
{
- unsigned msr;
- u64 val = 0;
+ struct reg_action ra;
switch (domain) {
case RAPL_DOMAIN_PACKAGE:
- msr = MSR_PKG_ENERGY_STATUS;
- break;
case RAPL_DOMAIN_PP0:
- msr = MSR_PP0_ENERGY_STATUS;
- break;
case RAPL_DOMAIN_PP1:
- msr = MSR_PP1_ENERGY_STATUS;
- break;
case RAPL_DOMAIN_DRAM:
- msr = MSR_DRAM_ENERGY_STATUS;
+ ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
break;
case RAPL_DOMAIN_PLATFORM:
/* PSYS(PLATFORM) is not a CPU domain, so avoid printing an error */
@@ -1343,19 +1164,20 @@ static int rapl_check_domain(int cpu, int domain)
/* make sure domain counters are available and contain non-zero
* values, otherwise skip it.
*/
- if (rdmsrl_safe_on_cpu(cpu, msr, &val) || !val)
+
+ ra.mask = ~0;
+ if (rp->priv->read_raw(cpu, &ra) || !ra.value)
return -ENODEV;
return 0;
}
-
/*
* Check if power limits are available. Two cases when they are not available:
* 1. Locked by BIOS, in this case we still provide read-only access so that
* users can see what limit is set by the BIOS.
* 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
- * exist at all. In this case, we do not show the contraints in powercap.
+ * exist at all. In this case, we do not show the constraints in powercap.
*
* Called after domains are detected and initialized.
*/
@@ -1372,9 +1194,10 @@ static void rapl_detect_powerlimit(struct rapl_domain *rd)
rd->state |= DOMAIN_STATE_BIOS_LOCKED;
}
}
- /* check if power limit MSRs exists, otherwise domain is monitoring only */
+ /* check if power limit MSR exists, otherwise domain is monitoring only */
for (i = 0; i < NR_POWER_LIMITS; i++) {
int prim = rd->rpl[i].prim_id;
+
if (rapl_read_data_raw(rd, prim, false, &val64))
rd->rpl[i].name = NULL;
}
@@ -1390,12 +1213,12 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu)
for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
/* use physical package id to read counters */
- if (!rapl_check_domain(cpu, i)) {
+ if (!rapl_check_domain(cpu, i, rp)) {
rp->domain_map |= 1 << i;
pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
}
}
- rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
+ rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
if (!rp->nr_domains) {
pr_debug("no valid rapl domains found in %s\n", rp->name);
return -ENODEV;
@@ -1403,7 +1226,7 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu)
pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);
rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
- GFP_KERNEL);
+ GFP_KERNEL);
if (!rp->domains)
return -ENOMEM;
@@ -1416,7 +1239,7 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu)
}
/* called from CPU hotplug notifier, hotplug lock held */
-static void rapl_remove_package(struct rapl_package *rp)
+void rapl_remove_package(struct rapl_package *rp)
{
struct rapl_domain *rd, *rd_package = NULL;
@@ -1435,16 +1258,35 @@ static void rapl_remove_package(struct rapl_package *rp)
}
pr_debug("remove package, undo power limit on %s: %s\n",
rp->name, rd->name);
- powercap_unregister_zone(control_type, &rd->power_zone);
+ powercap_unregister_zone(rp->priv->control_type,
+ &rd->power_zone);
}
/* do parent zone last */
- powercap_unregister_zone(control_type, &rd_package->power_zone);
+ powercap_unregister_zone(rp->priv->control_type,
+ &rd_package->power_zone);
list_del(&rp->plist);
kfree(rp);
}
+EXPORT_SYMBOL_GPL(rapl_remove_package);
+
+/* caller to ensure CPU hotplug lock is held */
+struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv)
+{
+ int id = topology_logical_die_id(cpu);
+ struct rapl_package *rp;
+
+ list_for_each_entry(rp, &rapl_packages, plist) {
+ if (rp->id == id
+ && rp->priv->control_type == priv->control_type)
+ return rp;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(rapl_find_package_domain);
/* called from CPU hotplug notifier, hotplug lock held */
-static struct rapl_package *rapl_add_package(int cpu)
+struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv)
{
int id = topology_logical_die_id(cpu);
struct rapl_package *rp;
@@ -1458,17 +1300,17 @@ static struct rapl_package *rapl_add_package(int cpu)
/* add the new package to the list */
rp->id = id;
rp->lead_cpu = cpu;
+ rp->priv = priv;
if (topology_max_die_per_package() > 1)
snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH,
- "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
+ "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id);
else
snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
- c->phys_proc_id);
+ c->phys_proc_id);
/* check if the package contains valid domains */
- if (rapl_detect_domains(rp, cpu) ||
- rapl_defaults->check_unit(rp, cpu)) {
+ if (rapl_detect_domains(rp, cpu) || rapl_defaults->check_unit(rp, cpu)) {
ret = -ENODEV;
goto err_free_package;
}
@@ -1484,47 +1326,7 @@ err_free_package:
kfree(rp);
return ERR_PTR(ret);
}
-
-/* Handles CPU hotplug on multi-socket systems.
- * If a CPU goes online as the first CPU of the physical package
- * we add the RAPL package to the system. Similarly, when the last
- * CPU of the package is removed, we remove the RAPL package and its
- * associated domains. Cooling devices are handled accordingly at
- * per-domain level.
- */
-static int rapl_cpu_online(unsigned int cpu)
-{
- struct rapl_package *rp;
-
- rp = rapl_find_package_domain(cpu);
- if (!rp) {
- rp = rapl_add_package(cpu);
- if (IS_ERR(rp))
- return PTR_ERR(rp);
- }
- cpumask_set_cpu(cpu, &rp->cpumask);
- return 0;
-}
-
-static int rapl_cpu_down_prep(unsigned int cpu)
-{
- struct rapl_package *rp;
- int lead_cpu;
-
- rp = rapl_find_package_domain(cpu);
- if (!rp)
- return 0;
-
- cpumask_clear_cpu(cpu, &rp->cpumask);
- lead_cpu = cpumask_first(&rp->cpumask);
- if (lead_cpu >= nr_cpu_ids)
- rapl_remove_package(rp);
- else if (rp->lead_cpu == cpu)
- rp->lead_cpu = lead_cpu;
- return 0;
-}
-
-static enum cpuhp_state pcap_rapl_online;
+EXPORT_SYMBOL_GPL(rapl_add_package);
static void power_limit_state_save(void)
{
@@ -1542,17 +1344,15 @@ static void power_limit_state_save(void)
switch (rd->rpl[i].prim_id) {
case PL1_ENABLE:
ret = rapl_read_data_raw(rd,
- POWER_LIMIT1,
- true,
- &rd->rpl[i].last_power_limit);
+ POWER_LIMIT1, true,
+ &rd->rpl[i].last_power_limit);
if (ret)
rd->rpl[i].last_power_limit = 0;
break;
case PL2_ENABLE:
ret = rapl_read_data_raw(rd,
- POWER_LIMIT2,
- true,
- &rd->rpl[i].last_power_limit);
+ POWER_LIMIT2, true,
+ &rd->rpl[i].last_power_limit);
if (ret)
rd->rpl[i].last_power_limit = 0;
break;
@@ -1578,15 +1378,13 @@ static void power_limit_state_restore(void)
switch (rd->rpl[i].prim_id) {
case PL1_ENABLE:
if (rd->rpl[i].last_power_limit)
- rapl_write_data_raw(rd,
- POWER_LIMIT1,
- rd->rpl[i].last_power_limit);
+ rapl_write_data_raw(rd, POWER_LIMIT1,
+ rd->rpl[i].last_power_limit);
break;
case PL2_ENABLE:
if (rd->rpl[i].last_power_limit)
- rapl_write_data_raw(rd,
- POWER_LIMIT2,
- rd->rpl[i].last_power_limit);
+ rapl_write_data_raw(rd, POWER_LIMIT2,
+ rd->rpl[i].last_power_limit);
break;
}
}
@@ -1595,7 +1393,7 @@ static void power_limit_state_restore(void)
}
static int rapl_pm_callback(struct notifier_block *nb,
- unsigned long mode, void *_unused)
+ unsigned long mode, void *_unused)
{
switch (mode) {
case PM_SUSPEND_PREPARE:
@@ -1612,6 +1410,8 @@ static struct notifier_block rapl_pm_notifier = {
.notifier_call = rapl_pm_callback,
};
+static struct platform_device *rapl_msr_platdev;
+
static int __init rapl_init(void)
{
const struct x86_cpu_id *id;
@@ -1620,50 +1420,43 @@ static int __init rapl_init(void)
id = x86_match_cpu(rapl_ids);
if (!id) {
pr_err("driver does not support CPU family %d model %d\n",
- boot_cpu_data.x86, boot_cpu_data.x86_model);
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
return -ENODEV;
}
rapl_defaults = (struct rapl_defaults *)id->driver_data;
- ret = rapl_register_powercap();
+ ret = register_pm_notifier(&rapl_pm_notifier);
if (ret)
return ret;
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online",
- rapl_cpu_online, rapl_cpu_down_prep);
- if (ret < 0)
- goto err_unreg;
- pcap_rapl_online = ret;
-
- /* Don't bail out if PSys is not supported */
- rapl_register_psys();
+ rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0);
+ if (!rapl_msr_platdev) {
+ ret = -ENOMEM;
+ goto end;
+ }
- ret = register_pm_notifier(&rapl_pm_notifier);
+ ret = platform_device_add(rapl_msr_platdev);
if (ret)
- goto err_unreg_all;
+ platform_device_put(rapl_msr_platdev);
- return 0;
-
-err_unreg_all:
- cpuhp_remove_state(pcap_rapl_online);
+end:
+ if (ret)
+ unregister_pm_notifier(&rapl_pm_notifier);
-err_unreg:
- rapl_unregister_powercap();
return ret;
}
static void __exit rapl_exit(void)
{
+ platform_device_unregister(rapl_msr_platdev);
unregister_pm_notifier(&rapl_pm_notifier);
- cpuhp_remove_state(pcap_rapl_online);
- rapl_unregister_powercap();
}
module_init(rapl_init);
module_exit(rapl_exit);
-MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit)");
+MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
new file mode 100644
index 000000000000..d5487965bdfe
--- /dev/null
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel Running Average Power Limit (RAPL) Driver via MSR interface
+ * Copyright (c) 2019, Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/bitmap.h>
+#include <linux/delay.h>
+#include <linux/sysfs.h>
+#include <linux/cpu.h>
+#include <linux/powercap.h>
+#include <linux/suspend.h>
+#include <linux/intel_rapl.h>
+#include <linux/processor.h>
+#include <linux/platform_device.h>
+
+#include <asm/iosf_mbi.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+/* Local defines */
+#define MSR_PLATFORM_POWER_LIMIT 0x0000065C
+
+/* private data for RAPL MSR Interface */
+static struct rapl_if_priv rapl_msr_priv = {
+ .reg_unit = MSR_RAPL_POWER_UNIT,
+ .regs[RAPL_DOMAIN_PACKAGE] = {
+ MSR_PKG_POWER_LIMIT, MSR_PKG_ENERGY_STATUS, MSR_PKG_PERF_STATUS, 0, MSR_PKG_POWER_INFO },
+ .regs[RAPL_DOMAIN_PP0] = {
+ MSR_PP0_POWER_LIMIT, MSR_PP0_ENERGY_STATUS, 0, MSR_PP0_POLICY, 0 },
+ .regs[RAPL_DOMAIN_PP1] = {
+ MSR_PP1_POWER_LIMIT, MSR_PP1_ENERGY_STATUS, 0, MSR_PP1_POLICY, 0 },
+ .regs[RAPL_DOMAIN_DRAM] = {
+ MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO },
+ .regs[RAPL_DOMAIN_PLATFORM] = {
+ MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0},
+ .limits[RAPL_DOMAIN_PACKAGE] = 2,
+};
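
The regs[] rows above are laid out in rapl_domain_reg_id order (limit, status, perf, policy, info), the same RAPL_DOMAIN_REG_LIMIT..RAPL_DOMAIN_REG_MAX range the MMIO variant iterates over later in this patch; a zero entry means the domain has no such register. A minimal lookup sketch, not part of the patch, assuming RAPL_DOMAIN_REG_STATUS names the status column:

/* Hedged sketch: column lookup in the table above. */
static u64 rapl_msr_status_reg(enum rapl_domain_type domain)
{
	return rapl_msr_priv.regs[domain][RAPL_DOMAIN_REG_STATUS];
}

For RAPL_DOMAIN_PACKAGE this would return MSR_PKG_ENERGY_STATUS.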
+
+/*
+ * Handles CPU hotplug on multi-socket systems.
+ * If a CPU goes online as the first CPU of the physical package,
+ * we add the RAPL package to the system. Similarly, when the last
+ * CPU of the package is removed, we remove the RAPL package and its
+ * associated domains. Cooling devices are handled accordingly at
+ * the per-domain level.
+ */
+static int rapl_cpu_online(unsigned int cpu)
+{
+ struct rapl_package *rp;
+
+ rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
+ if (!rp) {
+ rp = rapl_add_package(cpu, &rapl_msr_priv);
+ if (IS_ERR(rp))
+ return PTR_ERR(rp);
+ }
+ cpumask_set_cpu(cpu, &rp->cpumask);
+ return 0;
+}
+
+static int rapl_cpu_down_prep(unsigned int cpu)
+{
+ struct rapl_package *rp;
+ int lead_cpu;
+
+ rp = rapl_find_package_domain(cpu, &rapl_msr_priv);
+ if (!rp)
+ return 0;
+
+ cpumask_clear_cpu(cpu, &rp->cpumask);
+ lead_cpu = cpumask_first(&rp->cpumask);
+ if (lead_cpu >= nr_cpu_ids)
+ rapl_remove_package(rp);
+ else if (rp->lead_cpu == cpu)
+ rp->lead_cpu = lead_cpu;
+ return 0;
+}
+
+static int rapl_msr_read_raw(int cpu, struct reg_action *ra)
+{
+ u32 msr = (u32)ra->reg;
+
+ if (rdmsrl_safe_on_cpu(cpu, msr, &ra->value)) {
+ pr_debug("failed to read msr 0x%x on cpu %d\n", msr, cpu);
+ return -EIO;
+ }
+ ra->value &= ra->mask;
+ return 0;
+}
+
+static void rapl_msr_update_func(void *info)
+{
+ struct reg_action *ra = info;
+ u32 msr = (u32)ra->reg;
+ u64 val;
+
+ ra->err = rdmsrl_safe(msr, &val);
+ if (ra->err)
+ return;
+
+ val &= ~ra->mask;
+ val |= ra->value;
+
+ ra->err = wrmsrl_safe(msr, val);
+}
+
+static int rapl_msr_write_raw(int cpu, struct reg_action *ra)
+{
+ int ret;
+
+ ret = smp_call_function_single(cpu, rapl_msr_update_func, ra, 1);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
+ return ra->err;
+}
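
rapl_msr_write_raw() cannot simply write from the calling CPU: the read-modify-write has to execute on the CPU that owns the MSR, so it bounces rapl_msr_update_func() there with smp_call_function_single() and carries the result back in ra->err. A standalone sketch of the same pattern, with hypothetical names (msr_rmw_req, msr_rmw) that are not kernel API:

/* Hedged sketch of the IPI read-modify-write pattern above. */
struct msr_rmw_req {
	u32 msr;
	u64 mask;	/* bits to replace */
	u64 bits;	/* new value for those bits */
	int err;
};

static void msr_rmw_local(void *data)	/* runs on the target CPU */
{
	struct msr_rmw_req *req = data;
	u64 val;

	req->err = rdmsrl_safe(req->msr, &val);
	if (req->err)
		return;
	val = (val & ~req->mask) | (req->bits & req->mask);
	req->err = wrmsrl_safe(req->msr, val);
}

static int msr_rmw(int cpu, struct msr_rmw_req *req)
{
	int ret = smp_call_function_single(cpu, msr_rmw_local, req, 1);

	return ret ? ret : req->err;	/* IPI failure wins over MSR error */
}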
+
+static int rapl_msr_probe(struct platform_device *pdev)
+{
+ int ret;
+
+ rapl_msr_priv.read_raw = rapl_msr_read_raw;
+ rapl_msr_priv.write_raw = rapl_msr_write_raw;
+
+ rapl_msr_priv.control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
+ if (IS_ERR(rapl_msr_priv.control_type)) {
+ pr_debug("failed to register powercap control_type.\n");
+ return PTR_ERR(rapl_msr_priv.control_type);
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online",
+ rapl_cpu_online, rapl_cpu_down_prep);
+ if (ret < 0)
+ goto out;
+ rapl_msr_priv.pcap_rapl_online = ret;
+
+ /* Don't bail out if PSys is not supported */
+ rapl_add_platform_domain(&rapl_msr_priv);
+
+ return 0;
+
+out:
+ if (ret)
+ powercap_unregister_control_type(rapl_msr_priv.control_type);
+ return ret;
+}
+
+static int rapl_msr_remove(struct platform_device *pdev)
+{
+ cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
+ rapl_remove_platform_domain(&rapl_msr_priv);
+ powercap_unregister_control_type(rapl_msr_priv.control_type);
+ return 0;
+}
+
+static const struct platform_device_id rapl_msr_ids[] = {
+ { .name = "intel_rapl_msr", },
+ {}
+};
+MODULE_DEVICE_TABLE(platform, rapl_msr_ids);
+
+static struct platform_driver intel_rapl_msr_driver = {
+ .probe = rapl_msr_probe,
+ .remove = rapl_msr_remove,
+ .id_table = rapl_msr_ids,
+ .driver = {
+ .name = "intel_rapl_msr",
+ },
+};
+
+module_platform_driver(intel_rapl_msr_driver);
+
+MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit) control via MSR interface");
+MODULE_AUTHOR("Zhang Rui <rui.zhang@intel.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index d04d4378ca50..63502ca537eb 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -679,7 +679,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
goto put_dev;
dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
- &dcssblk_dax_ops);
+ &dcssblk_dax_ops, DAXDEV_F_SYNC);
if (!dev_info->dax_dev) {
rc = -ENOMEM;
goto put_dev;
diff --git a/drivers/thermal/intel/int340x_thermal/Kconfig b/drivers/thermal/intel/int340x_thermal/Kconfig
index 5333e018c88c..797907542e43 100644
--- a/drivers/thermal/intel/int340x_thermal/Kconfig
+++ b/drivers/thermal/intel/int340x_thermal/Kconfig
@@ -40,4 +40,10 @@ config INT3406_THERMAL
brightness in order to address a thermal condition or to reduce
power consumed by display device.
+config PROC_THERMAL_MMIO_RAPL
+ bool
+ depends on 64BIT
+ depends on POWERCAP
+ select INTEL_RAPL_CORE
+ default y
endif
diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
index 77dae1e7c3bf..213ab3cc6b80 100644
--- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
+++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
@@ -11,6 +11,8 @@
#include <linux/platform_device.h>
#include <linux/acpi.h>
#include <linux/thermal.h>
+#include <linux/cpuhotplug.h>
+#include <linux/intel_rapl.h>
#include "int340x_thermal_zone.h"
#include "../intel_soc_dts_iosf.h"
@@ -37,6 +39,8 @@
/* GeminiLake thermal reporting device */
#define PCI_DEVICE_ID_PROC_GLK_THERMAL 0x318C
+#define DRV_NAME "proc_thermal"
+
struct power_config {
u32 index;
u32 min_uw;
@@ -52,6 +56,7 @@ struct proc_thermal_device {
struct power_config power_limits[2];
struct int34x_thermal_zone *int340x_zone;
struct intel_soc_dts_sensors *soc_dts;
+ void __iomem *mmio_base;
};
enum proc_thermal_emum_mode_type {
@@ -60,6 +65,12 @@ enum proc_thermal_emum_mode_type {
PROC_THERMAL_PLATFORM_DEV
};
+struct rapl_mmio_regs {
+ u64 reg_unit;
+ u64 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
+ int limits[RAPL_DOMAIN_MAX];
+};
+
/*
* We can have only one type of enumeration, PCI or Platform,
* not both. So we don't need instance specific data.
@@ -367,8 +378,151 @@ static irqreturn_t proc_thermal_pci_msi_irq(int irq, void *devid)
return IRQ_HANDLED;
}
+#ifdef CONFIG_PROC_THERMAL_MMIO_RAPL
+
+#define MCHBAR 0
+
+/* RAPL Support via MMIO interface */
+static struct rapl_if_priv rapl_mmio_priv;
+
+static int rapl_mmio_cpu_online(unsigned int cpu)
+{
+ struct rapl_package *rp;
+
+ /* mmio rapl supports package 0 only for now */
+ if (topology_physical_package_id(cpu))
+ return 0;
+
+ rp = rapl_find_package_domain(cpu, &rapl_mmio_priv);
+ if (!rp) {
+ rp = rapl_add_package(cpu, &rapl_mmio_priv);
+ if (IS_ERR(rp))
+ return PTR_ERR(rp);
+ }
+ cpumask_set_cpu(cpu, &rp->cpumask);
+ return 0;
+}
+
+static int rapl_mmio_cpu_down_prep(unsigned int cpu)
+{
+ struct rapl_package *rp;
+ int lead_cpu;
+
+ rp = rapl_find_package_domain(cpu, &rapl_mmio_priv);
+ if (!rp)
+ return 0;
+
+ cpumask_clear_cpu(cpu, &rp->cpumask);
+ lead_cpu = cpumask_first(&rp->cpumask);
+ if (lead_cpu >= nr_cpu_ids)
+ rapl_remove_package(rp);
+ else if (rp->lead_cpu == cpu)
+ rp->lead_cpu = lead_cpu;
+ return 0;
+}
+
+static int rapl_mmio_read_raw(int cpu, struct reg_action *ra)
+{
+ if (!ra->reg)
+ return -EINVAL;
+
+ ra->value = readq((void __iomem *)ra->reg);
+ ra->value &= ra->mask;
+ return 0;
+}
+
+static int rapl_mmio_write_raw(int cpu, struct reg_action *ra)
+{
+ u64 val;
+
+ if (!ra->reg)
+ return -EINVAL;
+
+ val = readq((void __iomem *)ra->reg);
+ val &= ~ra->mask;
+ val |= ra->value;
+ writeq(val, (void __iomem *)ra->reg);
+ return 0;
+}
+
+static int proc_thermal_rapl_add(struct pci_dev *pdev,
+ struct proc_thermal_device *proc_priv,
+ struct rapl_mmio_regs *rapl_regs)
+{
+ enum rapl_domain_reg_id reg;
+ enum rapl_domain_type domain;
+ int ret;
+
+ if (!rapl_regs)
+ return 0;
+
+ ret = pcim_iomap_regions(pdev, 1 << MCHBAR, DRV_NAME);
+ if (ret) {
+ dev_err(&pdev->dev, "cannot reserve PCI memory region\n");
+ return -ENOMEM;
+ }
+
+ proc_priv->mmio_base = pcim_iomap_table(pdev)[MCHBAR];
+
+ for (domain = RAPL_DOMAIN_PACKAGE; domain < RAPL_DOMAIN_MAX; domain++) {
+ for (reg = RAPL_DOMAIN_REG_LIMIT; reg < RAPL_DOMAIN_REG_MAX; reg++)
+ if (rapl_regs->regs[domain][reg])
+ rapl_mmio_priv.regs[domain][reg] =
+ (u64)proc_priv->mmio_base +
+ rapl_regs->regs[domain][reg];
+ rapl_mmio_priv.limits[domain] = rapl_regs->limits[domain];
+ }
+ rapl_mmio_priv.reg_unit = (u64)proc_priv->mmio_base + rapl_regs->reg_unit;
+
+ rapl_mmio_priv.read_raw = rapl_mmio_read_raw;
+ rapl_mmio_priv.write_raw = rapl_mmio_write_raw;
+
+ rapl_mmio_priv.control_type = powercap_register_control_type(NULL, "intel-rapl-mmio", NULL);
+ if (IS_ERR(rapl_mmio_priv.control_type)) {
+ pr_debug("failed to register powercap control_type.\n");
+ return PTR_ERR(rapl_mmio_priv.control_type);
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online",
+ rapl_mmio_cpu_online, rapl_mmio_cpu_down_prep);
+ if (ret < 0) {
+ powercap_unregister_control_type(rapl_mmio_priv.control_type);
+ return ret;
+ }
+ rapl_mmio_priv.pcap_rapl_online = ret;
+
+ return 0;
+}
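
The loop above turns the chip-specific offsets from rapl_mmio_regs into usable addresses: every non-zero offset is rebased onto the ioremapped MCHBAR region, and the result is stored in rapl_mmio_priv.regs as a u64 that rapl_mmio_read_raw()/write_raw() later cast back to void __iomem *. A hedged sketch of the computation for a single register, not part of the patch (the 0x59a0 offset is the package power-limit entry from the rapl_mmio_hsw table below):

/* Hedged sketch: forming one MMIO register address as the loop does. */
static u64 rapl_mmio_reg_addr(struct pci_dev *pdev, u64 offset)
{
	void __iomem *base = pcim_iomap_table(pdev)[MCHBAR];

	return offset ? (u64)base + offset : 0;	/* 0 stays "absent" */
}

/* e.g. package power limit: rapl_mmio_reg_addr(pdev, 0x59a0) */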
+
+static void proc_thermal_rapl_remove(void)
+{
+ cpuhp_remove_state(rapl_mmio_priv.pcap_rapl_online);
+ powercap_unregister_control_type(rapl_mmio_priv.control_type);
+}
+
+static const struct rapl_mmio_regs rapl_mmio_hsw = {
+ .reg_unit = 0x5938,
+ .regs[RAPL_DOMAIN_PACKAGE] = { 0x59a0, 0x593c, 0x58f0, 0, 0x5930},
+ .regs[RAPL_DOMAIN_DRAM] = { 0x58e0, 0x58e8, 0x58ec, 0, 0},
+ .limits[RAPL_DOMAIN_PACKAGE] = 2,
+ .limits[RAPL_DOMAIN_DRAM] = 2,
+};
+
+#else
+
+static int proc_thermal_rapl_add(struct pci_dev *pdev,
+ struct proc_thermal_device *proc_priv,
+ struct rapl_mmio_regs *rapl_regs)
+{
+ return 0;
+}
+static void proc_thermal_rapl_remove(void) {}
+static const struct rapl_mmio_regs rapl_mmio_hsw;
+
+#endif /* CONFIG_PROC_THERMAL_MMIO_RAPL */
+
static int proc_thermal_pci_probe(struct pci_dev *pdev,
- const struct pci_device_id *unused)
+ const struct pci_device_id *id)
{
struct proc_thermal_device *proc_priv;
int ret;
@@ -378,15 +532,21 @@ static int proc_thermal_pci_probe(struct pci_dev *pdev,
return -ENODEV;
}
- ret = pci_enable_device(pdev);
+ ret = pcim_enable_device(pdev);
if (ret < 0) {
dev_err(&pdev->dev, "error: could not enable device\n");
return ret;
}
ret = proc_thermal_add(&pdev->dev, &proc_priv);
+ if (ret)
+ return ret;
+
+ ret = proc_thermal_rapl_add(pdev, proc_priv,
+ (struct rapl_mmio_regs *)id->driver_data);
if (ret) {
- pci_disable_device(pdev);
+ dev_err(&pdev->dev, "failed to add RAPL MMIO interface\n");
+ proc_thermal_remove(proc_priv);
return ret;
}
@@ -439,8 +599,8 @@ static void proc_thermal_pci_remove(struct pci_dev *pdev)
pci_disable_msi(pdev);
}
}
+ proc_thermal_rapl_remove();
proc_thermal_remove(proc_priv);
- pci_disable_device(pdev);
}
#ifdef CONFIG_PM_SLEEP
@@ -462,7 +622,8 @@ static SIMPLE_DEV_PM_OPS(proc_thermal_pm, NULL, proc_thermal_resume);
static const struct pci_device_id proc_thermal_pci_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BDW_THERMAL)},
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_HSB_THERMAL)},
- { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_SKL_THERMAL)},
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_SKL_THERMAL),
+ .driver_data = (kernel_ulong_t)&rapl_mmio_hsw, },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BSW_THERMAL)},
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXT0_THERMAL)},
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXT1_THERMAL)},
@@ -477,7 +638,7 @@ static const struct pci_device_id proc_thermal_pci_ids[] = {
MODULE_DEVICE_TABLE(pci, proc_thermal_pci_ids);
static struct pci_driver proc_thermal_pci_driver = {
- .name = "proc_thermal",
+ .name = DRV_NAME,
.probe = proc_thermal_pci_probe,
.remove = proc_thermal_pci_remove,
.id_table = proc_thermal_pci_ids,
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 023fc3bc01c6..078615cf2afc 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -43,6 +43,17 @@ config VIRTIO_PCI_LEGACY
If unsure, say Y.
+config VIRTIO_PMEM
+ tristate "Support for virtio pmem driver"
+ depends on VIRTIO
+ depends on LIBNVDIMM
+ help
+ This driver provides access to virtio-pmem devices: storage devices
+ that are mapped into the physical address space, similar to NVDIMMs,
+ with a virtio-based flushing interface.
+
+ If unsure, say Y.
+
config VIRTIO_BALLOON
tristate "Virtio balloon driver"
depends on VIRTIO
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 6cad0b33d7ad..8188963a405b 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -58,6 +58,15 @@ config WATCHDOG_HANDLE_BOOT_ENABLED
the watchdog on its own. Thus if your userspace does not start fast
enough your device will reboot.
+config WATCHDOG_OPEN_TIMEOUT
+ int "Timeout value for opening watchdog device"
+ default 0
+ help
+ The maximum time, in seconds, for which the watchdog framework takes
+ care of pinging a hardware watchdog. A value of 0 means infinite. The
+ value set here can be overridden by the command line parameter
+ "watchdog.open_timeout".
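For example, booting with watchdog.open_timeout=60 gives user space a 60-second window to open /dev/watchdog before the framework stops pinging on its own, regardless of the value compiled in here.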
+
config WATCHDOG_SYSFS
bool "Read different watchdog information through sysfs"
help
@@ -717,6 +726,7 @@ config IMX2_WDT
config IMX_SC_WDT
tristate "IMX SC Watchdog"
depends on HAVE_ARM_SMCCC
+ depends on IMX_SCU
select WATCHDOG_CORE
help
This is the driver for the system controller watchdog
diff --git a/drivers/watchdog/acquirewdt.c b/drivers/watchdog/acquirewdt.c
index 957d1255d4ca..848db958411e 100644
--- a/drivers/watchdog/acquirewdt.c
+++ b/drivers/watchdog/acquirewdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Acquire Single Board Computer Watchdog Timer driver
*
@@ -6,11 +7,6 @@
* (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/advantechwdt.c b/drivers/watchdog/advantechwdt.c
index 2766af292a71..0d02bb275b3d 100644
--- a/drivers/watchdog/advantechwdt.c
+++ b/drivers/watchdog/advantechwdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Advantech Single Board Computer WDT driver
*
@@ -9,11 +10,6 @@
* (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/aspeed_wdt.c b/drivers/watchdog/aspeed_wdt.c
index f0148637e5dd..cc71861e033a 100644
--- a/drivers/watchdog/aspeed_wdt.c
+++ b/drivers/watchdog/aspeed_wdt.c
@@ -309,13 +309,7 @@ static int aspeed_wdt_probe(struct platform_device *pdev)
if (status & WDT_TIMEOUT_STATUS_BOOT_SECONDARY)
wdt->wdd.bootstatus = WDIOF_CARDRESET;
- ret = devm_watchdog_register_device(dev, &wdt->wdd);
- if (ret) {
- dev_err(dev, "failed to register\n");
- return ret;
- }
-
- return 0;
+ return devm_watchdog_register_device(dev, &wdt->wdd);
}
static struct platform_driver aspeed_watchdog_driver = {
diff --git a/drivers/watchdog/bcm2835_wdt.c b/drivers/watchdog/bcm2835_wdt.c
index 560c1c54c177..dec6ca019bea 100644
--- a/drivers/watchdog/bcm2835_wdt.c
+++ b/drivers/watchdog/bcm2835_wdt.c
@@ -202,10 +202,8 @@ static int bcm2835_wdt_probe(struct platform_device *pdev)
watchdog_stop_on_reboot(&bcm2835_wdt_wdd);
err = devm_watchdog_register_device(dev, &bcm2835_wdt_wdd);
- if (err) {
- dev_err(dev, "Failed to register watchdog device");
+ if (err)
return err;
- }
if (pm_power_off == NULL) {
pm_power_off = bcm2835_power_off;
@@ -240,6 +238,7 @@ module_param(nowayout, bool, 0);
MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+MODULE_ALIAS("platform:bcm2835-wdt");
MODULE_AUTHOR("Lubomir Rintel <lkundrak@v3.sk>");
MODULE_DESCRIPTION("Driver for Broadcom BCM2835 watchdog timer");
MODULE_LICENSE("GPL");
diff --git a/drivers/watchdog/bcm7038_wdt.c b/drivers/watchdog/bcm7038_wdt.c
index d3d88f6703d7..979caa18d3c8 100644
--- a/drivers/watchdog/bcm7038_wdt.c
+++ b/drivers/watchdog/bcm7038_wdt.c
@@ -159,10 +159,8 @@ static int bcm7038_wdt_probe(struct platform_device *pdev)
watchdog_stop_on_reboot(&wdt->wdd);
watchdog_stop_on_unregister(&wdt->wdd);
err = devm_watchdog_register_device(dev, &wdt->wdd);
- if (err) {
- dev_err(dev, "Failed to register watchdog device\n");
+ if (err)
return err;
- }
dev_info(dev, "Registered BCM7038 Watchdog\n");
diff --git a/drivers/watchdog/bcm_kona_wdt.c b/drivers/watchdog/bcm_kona_wdt.c
index 921291025680..eb850a8d19df 100644
--- a/drivers/watchdog/bcm_kona_wdt.c
+++ b/drivers/watchdog/bcm_kona_wdt.c
@@ -301,10 +301,8 @@ static int bcm_kona_wdt_probe(struct platform_device *pdev)
watchdog_stop_on_reboot(&bcm_kona_wdt_wdd);
watchdog_stop_on_unregister(&bcm_kona_wdt_wdd);
ret = devm_watchdog_register_device(dev, &bcm_kona_wdt_wdd);
- if (ret) {
- dev_err(dev, "Failed to register watchdog device");
+ if (ret)
return ret;
- }
bcm_kona_wdt_debug_init(pdev);
dev_dbg(dev, "Broadcom Kona Watchdog Timer");
diff --git a/drivers/watchdog/cadence_wdt.c b/drivers/watchdog/cadence_wdt.c
index a22f2d431a35..f8d4e91d0383 100644
--- a/drivers/watchdog/cadence_wdt.c
+++ b/drivers/watchdog/cadence_wdt.c
@@ -363,10 +363,8 @@ static int cdns_wdt_probe(struct platform_device *pdev)
watchdog_stop_on_reboot(cdns_wdt_device);
watchdog_stop_on_unregister(cdns_wdt_device);
ret = devm_watchdog_register_device(dev, cdns_wdt_device);
- if (ret) {
- dev_err(dev, "Failed to register wdt device\n");
+ if (ret)
return ret;
- }
platform_set_drvdata(pdev, wdt);
dev_info(dev, "Xilinx Watchdog Timer at %p with timeout %ds%s\n",
diff --git a/drivers/watchdog/da9052_wdt.c b/drivers/watchdog/da9052_wdt.c
index a2feef1ff307..d708c091bf1b 100644
--- a/drivers/watchdog/da9052_wdt.c
+++ b/drivers/watchdog/da9052_wdt.c
@@ -176,14 +176,7 @@ static int da9052_wdt_probe(struct platform_device *pdev)
return ret;
}
- ret = devm_watchdog_register_device(dev, &driver_data->wdt);
- if (ret != 0) {
- dev_err(da9052->dev, "watchdog_register_device() failed: %d\n",
- ret);
- return ret;
- }
-
- return ret;
+ return devm_watchdog_register_device(dev, &driver_data->wdt);
}
static struct platform_driver da9052_wdt_driver = {
diff --git a/drivers/watchdog/da9062_wdt.c b/drivers/watchdog/da9062_wdt.c
index aac749cfaccb..e149e66a6ea9 100644
--- a/drivers/watchdog/da9062_wdt.c
+++ b/drivers/watchdog/da9062_wdt.c
@@ -214,11 +214,8 @@ static int da9062_wdt_probe(struct platform_device *pdev)
watchdog_set_drvdata(&wdt->wdtdev, wdt);
ret = devm_watchdog_register_device(dev, &wdt->wdtdev);
- if (ret < 0) {
- dev_err(wdt->hw->dev,
- "watchdog registration failed (%d)\n", ret);
+ if (ret < 0)
return ret;
- }
return da9062_wdt_ping(&wdt->wdtdev);
}
diff --git a/drivers/watchdog/davinci_wdt.c b/drivers/watchdog/davinci_wdt.c
index 7b2ee35b5ffd..2b3f3cd382ef 100644
--- a/drivers/watchdog/davinci_wdt.c
+++ b/drivers/watchdog/davinci_wdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* drivers/char/watchdog/davinci_wdt.c
*
@@ -5,10 +6,7 @@
*
* Copyright (C) 2006-2013 Texas Instruments.
*
- * 2007 (c) MontaVista Software, Inc. This file is licensed under
- * the terms of the GNU General Public License version 2. This program
- * is licensed "as is" without any warranty of any kind, whether express
- * or implied.
+ * 2007 (c) MontaVista Software, Inc.
*/
#include <linux/module.h>
@@ -247,13 +245,7 @@ static int davinci_wdt_probe(struct platform_device *pdev)
if (IS_ERR(davinci_wdt->base))
return PTR_ERR(davinci_wdt->base);
- ret = devm_watchdog_register_device(dev, wdd);
- if (ret) {
- dev_err(dev, "cannot register watchdog device\n");
- return ret;
- }
-
- return 0;
+ return devm_watchdog_register_device(dev, wdd);
}
static const struct of_device_id davinci_wdt_of_match[] = {
diff --git a/drivers/watchdog/digicolor_wdt.c b/drivers/watchdog/digicolor_wdt.c
index 8af6e9a67d0d..073d37867f47 100644
--- a/drivers/watchdog/digicolor_wdt.c
+++ b/drivers/watchdog/digicolor_wdt.c
@@ -118,7 +118,6 @@ static int dc_wdt_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct dc_wdt *wdt;
- int ret;
wdt = devm_kzalloc(dev, sizeof(struct dc_wdt), GFP_KERNEL);
if (!wdt)
@@ -141,13 +140,7 @@ static int dc_wdt_probe(struct platform_device *pdev)
watchdog_set_restart_priority(&dc_wdt_wdd, 128);
watchdog_init_timeout(&dc_wdt_wdd, timeout, dev);
watchdog_stop_on_reboot(&dc_wdt_wdd);
- ret = devm_watchdog_register_device(dev, &dc_wdt_wdd);
- if (ret) {
- dev_err(dev, "Failed to register watchdog device");
- return ret;
- }
-
- return 0;
+ return devm_watchdog_register_device(dev, &dc_wdt_wdd);
}
static const struct of_device_id dc_wdt_of_match[] = {
diff --git a/drivers/watchdog/ebc-c384_wdt.c b/drivers/watchdog/ebc-c384_wdt.c
index c176f59fea28..8ef4b0df3855 100644
--- a/drivers/watchdog/ebc-c384_wdt.c
+++ b/drivers/watchdog/ebc-c384_wdt.c
@@ -2,15 +2,6 @@
/*
* Watchdog timer driver for the WinSystems EBC-C384
* Copyright (C) 2016 William Breathitt Gray
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/device.h>
#include <linux/dmi.h>
diff --git a/drivers/watchdog/eurotechwdt.c b/drivers/watchdog/eurotechwdt.c
index 89129e6fa9b6..3a83a48abcae 100644
--- a/drivers/watchdog/eurotechwdt.c
+++ b/drivers/watchdog/eurotechwdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Eurotech CPU-1220/1410/1420 on board WDT driver
*
@@ -11,11 +12,6 @@
* (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/ftwdt010_wdt.c b/drivers/watchdog/ftwdt010_wdt.c
index d9626ef9b9ae..21dcc7765688 100644
--- a/drivers/watchdog/ftwdt010_wdt.c
+++ b/drivers/watchdog/ftwdt010_wdt.c
@@ -165,10 +165,8 @@ static int ftwdt010_wdt_probe(struct platform_device *pdev)
}
ret = devm_watchdog_register_device(dev, &gwdt->wdd);
- if (ret) {
- dev_err(dev, "failed to register watchdog\n");
+ if (ret)
return ret;
- }
/* Set up platform driver data */
platform_set_drvdata(pdev, gwdt);
diff --git a/drivers/watchdog/gpio_wdt.c b/drivers/watchdog/gpio_wdt.c
index 777de10f2a78..0923201ce874 100644
--- a/drivers/watchdog/gpio_wdt.c
+++ b/drivers/watchdog/gpio_wdt.c
@@ -13,6 +13,12 @@
#include <linux/platform_device.h>
#include <linux/watchdog.h>
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, 0);
+MODULE_PARM_DESC(nowayout,
+ "Watchdog cannot be stopped once started (default="
+ __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
#define SOFT_TIMEOUT_MIN 1
#define SOFT_TIMEOUT_DEF 60
@@ -151,6 +157,7 @@ static int gpio_wdt_probe(struct platform_device *pdev)
priv->wdd.timeout = SOFT_TIMEOUT_DEF;
watchdog_init_timeout(&priv->wdd, 0, dev);
+ watchdog_set_nowayout(&priv->wdd, nowayout);
watchdog_stop_on_reboot(&priv->wdd);
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 8a90f159ffb1..7d34bcf1c45b 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -22,10 +22,11 @@
#include <linux/watchdog.h>
#include <asm/nmi.h>
-#define HPWDT_VERSION "2.0.2"
+#define HPWDT_VERSION "2.0.3"
#define SECS_TO_TICKS(secs) ((secs) * 1000 / 128)
#define TICKS_TO_SECS(ticks) ((ticks) * 128 / 1000)
-#define HPWDT_MAX_TIMER TICKS_TO_SECS(65535)
+#define HPWDT_MAX_TICKS 65535
+#define HPWDT_MAX_TIMER TICKS_TO_SECS(HPWDT_MAX_TICKS)
#define DEFAULT_MARGIN 30
#define PRETIMEOUT_SEC 9
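
With the 128 ms hardware tick implied by these macros, SECS_TO_TICKS(secs) is secs * 1000 / 128, so the 16-bit counter ceiling of HPWDT_MAX_TICKS (65535) works out to TICKS_TO_SECS(65535) = 65535 * 128 / 1000 = 8388 seconds, roughly 2 hours 20 minutes; that is the value that feeds max_hw_heartbeat_ms further down.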
@@ -33,6 +34,7 @@ static bool ilo5;
static unsigned int soft_margin = DEFAULT_MARGIN; /* in seconds */
static bool nowayout = WATCHDOG_NOWAYOUT;
static bool pretimeout = IS_ENABLED(CONFIG_HPWDT_NMI_DECODING);
+static int kdumptimeout = -1;
static void __iomem *pci_mem_addr; /* the PCI-memory address */
static unsigned long __iomem *hpwdt_nmistat;
@@ -52,15 +54,21 @@ static const struct pci_device_id hpwdt_blacklist[] = {
{0}, /* terminate list */
};
+static struct watchdog_device hpwdt_dev;
/*
* Watchdog operations
*/
+static int hpwdt_hw_is_running(void)
+{
+ return ioread8(hpwdt_timer_con) & 0x01;
+}
+
static int hpwdt_start(struct watchdog_device *wdd)
{
int control = 0x81 | (pretimeout ? 0x4 : 0);
- int reload = SECS_TO_TICKS(wdd->timeout);
+ int reload = SECS_TO_TICKS(min(wdd->timeout, wdd->max_hw_heartbeat_ms/1000));
- dev_dbg(wdd->parent, "start watchdog 0x%08x:0x%02x\n", reload, control);
+ dev_dbg(wdd->parent, "start watchdog 0x%08x:0x%08x:0x%02x\n", wdd->timeout, reload, control);
iowrite16(reload, hpwdt_timer_reg);
iowrite8(control, hpwdt_timer_con);
@@ -85,12 +93,18 @@ static int hpwdt_stop_core(struct watchdog_device *wdd)
return 0;
}
+static void hpwdt_ping_ticks(int val)
+{
+ val = min(val, HPWDT_MAX_TICKS);
+ iowrite16(val, hpwdt_timer_reg);
+}
+
static int hpwdt_ping(struct watchdog_device *wdd)
{
- int reload = SECS_TO_TICKS(wdd->timeout);
+ int reload = SECS_TO_TICKS(min(wdd->timeout, wdd->max_hw_heartbeat_ms/1000));
- dev_dbg(wdd->parent, "ping watchdog 0x%08x\n", reload);
- iowrite16(reload, hpwdt_timer_reg);
+ dev_dbg(wdd->parent, "ping watchdog 0x%08x:0x%08x\n", wdd->timeout, reload);
+ hpwdt_ping_ticks(reload);
return 0;
}
@@ -166,7 +180,14 @@ static int hpwdt_pretimeout(unsigned int ulReason, struct pt_regs *regs)
if (ilo5 && !pretimeout && !mynmi)
return NMI_DONE;
- hpwdt_stop();
+ if (kdumptimeout < 0)
+ hpwdt_stop();
+ else if (kdumptimeout == 0)
+ ;
+ else {
+ unsigned int val = max((unsigned int)kdumptimeout, hpwdt_dev.timeout);
+ hpwdt_ping_ticks(SECS_TO_TICKS(val));
+ }
hex_byte_pack(panic_msg, mynmi);
nmi_panic(regs, panic_msg);
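
The ladder above encodes the three kdumptimeout policies: a negative value (the default) keeps the old behaviour and stops the timer before panicking; zero leaves the running timer untouched; a positive value re-arms the hardware for max(kdumptimeout, timeout) seconds so the crash kernel gets at least that long to take over before the watchdog fires.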
@@ -204,9 +225,9 @@ static struct watchdog_device hpwdt_dev = {
.info = &ident,
.ops = &hpwdt_ops,
.min_timeout = 1,
- .max_timeout = HPWDT_MAX_TIMER,
.timeout = DEFAULT_MARGIN,
.pretimeout = PRETIMEOUT_SEC,
+ .max_hw_heartbeat_ms = HPWDT_MAX_TIMER * 1000,
};
@@ -298,14 +319,18 @@ static int hpwdt_init_one(struct pci_dev *dev,
hpwdt_timer_reg = pci_mem_addr + 0x70;
hpwdt_timer_con = pci_mem_addr + 0x72;
- /* Make sure that timer is disabled until /dev/watchdog is opened */
- hpwdt_stop();
+ /* Have the core update running timer until user space is ready */
+ if (hpwdt_hw_is_running()) {
+ dev_info(&dev->dev, "timer is running\n");
+ set_bit(WDOG_HW_RUNNING, &hpwdt_dev.status);
+ }
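
Instead of unconditionally stopping a timer that firmware or a previous kernel may have armed, the probe now reports it via WDOG_HW_RUNNING; the watchdog core then keeps pinging the hardware on its own until user space opens the device, which is exactly the window the new WATCHDOG_OPEN_TIMEOUT / watchdog.open_timeout setting above bounds.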
/* Initialize NMI Decoding functionality */
retval = hpwdt_init_nmi_decoding(dev);
if (retval != 0)
goto error_init_nmi_decoding;
+ watchdog_stop_on_unregister(&hpwdt_dev);
watchdog_set_nowayout(&hpwdt_dev, nowayout);
watchdog_init_timeout(&hpwdt_dev, soft_margin, NULL);
@@ -314,13 +339,12 @@ static int hpwdt_init_one(struct pci_dev *dev,
pretimeout = 0;
}
hpwdt_dev.pretimeout = pretimeout ? PRETIMEOUT_SEC : 0;
+ kdumptimeout = min(kdumptimeout, HPWDT_MAX_TIMER);
hpwdt_dev.parent = &dev->dev;
retval = watchdog_register_device(&hpwdt_dev);
- if (retval < 0) {
- dev_err(&dev->dev, "watchdog register failed: %d.\n", retval);
+ if (retval < 0)
goto error_wd_register;
- }
dev_info(&dev->dev, "HPE Watchdog Timer Driver: Version: %s\n",
HPWDT_VERSION);
@@ -328,6 +352,7 @@ static int hpwdt_init_one(struct pci_dev *dev,
hpwdt_dev.timeout, nowayout);
dev_info(&dev->dev, "pretimeout: %s.\n",
pretimeout ? "on" : "off");
+ dev_info(&dev->dev, "kdumptimeout: %d.\n", kdumptimeout);
if (dev->subsystem_vendor == PCI_VENDOR_ID_HP_3PAR)
ilo5 = true;
@@ -345,9 +370,6 @@ error_pci_iomap:
static void hpwdt_exit(struct pci_dev *dev)
{
- if (!nowayout)
- hpwdt_stop();
-
watchdog_unregister_device(&hpwdt_dev);
hpwdt_exit_nmi_decoding();
pci_iounmap(dev, pci_mem_addr);
@@ -376,6 +398,9 @@ module_param(nowayout, bool, 0);
MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+module_param(kdumptimeout, int, 0444);
+MODULE_PARM_DESC(kdumptimeout, "Timeout applied for crash kernel transition in seconds");
+
#ifdef CONFIG_HPWDT_NMI_DECODING
module_param(pretimeout, bool, 0);
MODULE_PARM_DESC(pretimeout, "Watchdog pretimeout enabled");
diff --git a/drivers/watchdog/i6300esb.c b/drivers/watchdog/i6300esb.c
index f98f35a05896..a30835f547b3 100644
--- a/drivers/watchdog/i6300esb.c
+++ b/drivers/watchdog/i6300esb.c
@@ -315,11 +315,8 @@ static int esb_probe(struct pci_dev *pdev,
/* Register the watchdog so that userspace has access to it */
ret = watchdog_register_device(&edev->wdd);
- if (ret != 0) {
- dev_err(&pdev->dev,
- "cannot register watchdog device (err=%d)\n", ret);
+ if (ret != 0)
goto err_unmap;
- }
dev_info(&pdev->dev,
"initialized. heartbeat=%d sec (nowayout=%d)\n",
edev->wdd.timeout, nowayout);
diff --git a/drivers/watchdog/iTCO_vendor_support.c b/drivers/watchdog/iTCO_vendor_support.c
index 68a9d9cc2eb8..4f1b96f59349 100644
--- a/drivers/watchdog/iTCO_vendor_support.c
+++ b/drivers/watchdog/iTCO_vendor_support.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* intel TCO vendor specific watchdog driver support
*
* (c) Copyright 2006-2009 Wim Van Sebroeck <wim@iguana.be>.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Wim Van Sebroeck nor Iguana vzw. admit liability nor
* provide warranty for any of this software. This material is
* provided "AS-IS" and at no charge.
@@ -216,4 +212,3 @@ MODULE_AUTHOR("Wim Van Sebroeck <wim@iguana.be>, "
MODULE_DESCRIPTION("Intel TCO Vendor Specific WatchDog Timer Driver Support");
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE("GPL");
-
diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 89cea6ce9a08..c559f706ae7e 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* intel TCO Watchdog Driver
*
* (c) Copyright 2006-2011 Wim Van Sebroeck <wim@iguana.be>.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Wim Van Sebroeck nor Iguana vzw. admit liability nor
* provide warranty for any of this software. This material is
* provided "AS-IS" and at no charge.
diff --git a/drivers/watchdog/ib700wdt.c b/drivers/watchdog/ib700wdt.c
index 30d6cec582af..92fd7f33bc4d 100644
--- a/drivers/watchdog/ib700wdt.c
+++ b/drivers/watchdog/ib700wdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* IB700 Single Board Computer WDT driver
*
@@ -14,11 +15,6 @@
* (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/ie6xx_wdt.c b/drivers/watchdog/ie6xx_wdt.c
index 508fbefce9f6..8f28993fab8b 100644
--- a/drivers/watchdog/ie6xx_wdt.c
+++ b/drivers/watchdog/ie6xx_wdt.c
@@ -66,7 +66,7 @@ MODULE_PARM_DESC(resetmode,
static struct {
unsigned short sch_wdtba;
- struct spinlock unlock_sequence;
+ spinlock_t unlock_sequence;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs;
#endif
@@ -254,12 +254,8 @@ static int ie6xx_wdt_probe(struct platform_device *pdev)
ie6xx_wdt_debugfs_init();
ret = watchdog_register_device(&ie6xx_wdt_dev);
- if (ret) {
- dev_err(&pdev->dev,
- "Watchdog timer: cannot register device (err =%d)\n",
- ret);
+ if (ret)
goto misc_register_error;
- }
return 0;
diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c
index a606005dd65f..32af3974e6bb 100644
--- a/drivers/watchdog/imx2_wdt.c
+++ b/drivers/watchdog/imx2_wdt.c
@@ -316,10 +316,8 @@ static int __init imx2_wdt_probe(struct platform_device *pdev)
regmap_write(wdev->regmap, IMX2_WDT_WMCR, 0);
ret = watchdog_register_device(wdog);
- if (ret) {
- dev_err(&pdev->dev, "cannot register watchdog device\n");
+ if (ret)
goto disable_clk;
- }
dev_info(&pdev->dev, "timeout %d sec (nowayout=%d)\n",
wdog->timeout, nowayout);
diff --git a/drivers/watchdog/imx_sc_wdt.c b/drivers/watchdog/imx_sc_wdt.c
index 49848b66186c..78eaaf75a263 100644
--- a/drivers/watchdog/imx_sc_wdt.c
+++ b/drivers/watchdog/imx_sc_wdt.c
@@ -4,6 +4,7 @@
*/
#include <linux/arm-smccc.h>
+#include <linux/firmware/imx/sci.h>
#include <linux/io.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -33,11 +34,19 @@
#define SC_TIMER_WDOG_ACTION_PARTITION 0
+#define SC_IRQ_WDOG 1
+#define SC_IRQ_GROUP_WDOG 1
+
static bool nowayout = WATCHDOG_NOWAYOUT;
module_param(nowayout, bool, 0000);
MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+struct imx_sc_wdt_device {
+ struct watchdog_device wdd;
+ struct notifier_block wdt_notifier;
+};
+
static int imx_sc_wdt_ping(struct watchdog_device *wdog)
{
struct arm_smccc_res res;
@@ -85,24 +94,66 @@ static int imx_sc_wdt_set_timeout(struct watchdog_device *wdog,
return res.a0 ? -EACCES : 0;
}
+static int imx_sc_wdt_set_pretimeout(struct watchdog_device *wdog,
+ unsigned int pretimeout)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_smc(IMX_SIP_TIMER, IMX_SIP_TIMER_SET_PRETIME_WDOG,
+ pretimeout * 1000, 0, 0, 0, 0, 0, &res);
+ if (res.a0)
+ return -EACCES;
+
+ wdog->pretimeout = pretimeout;
+
+ return 0;
+}
+
+static int imx_sc_wdt_notify(struct notifier_block *nb,
+ unsigned long event, void *group)
+{
+ struct imx_sc_wdt_device *imx_sc_wdd =
+ container_of(nb,
+ struct imx_sc_wdt_device,
+ wdt_notifier);
+
+ if (event & SC_IRQ_WDOG &&
+ *(u8 *)group == SC_IRQ_GROUP_WDOG)
+ watchdog_notify_pretimeout(&imx_sc_wdd->wdd);
+
+ return 0;
+}
+
+static void imx_sc_wdt_action(void *data)
+{
+ struct notifier_block *wdt_notifier = data;
+
+ imx_scu_irq_unregister_notifier(wdt_notifier);
+ imx_scu_irq_group_enable(SC_IRQ_GROUP_WDOG,
+ SC_IRQ_WDOG,
+ false);
+}
+
static const struct watchdog_ops imx_sc_wdt_ops = {
.owner = THIS_MODULE,
.start = imx_sc_wdt_start,
.stop = imx_sc_wdt_stop,
.ping = imx_sc_wdt_ping,
.set_timeout = imx_sc_wdt_set_timeout,
+ .set_pretimeout = imx_sc_wdt_set_pretimeout,
};
-static const struct watchdog_info imx_sc_wdt_info = {
+static struct watchdog_info imx_sc_wdt_info = {
.identity = "i.MX SC watchdog timer",
.options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING |
- WDIOF_MAGICCLOSE | WDIOF_PRETIMEOUT,
+ WDIOF_MAGICCLOSE,
};
static int imx_sc_wdt_probe(struct platform_device *pdev)
{
+ struct imx_sc_wdt_device *imx_sc_wdd;
+ struct watchdog_device *wdog;
struct device *dev = &pdev->dev;
- struct watchdog_device *imx_sc_wdd;
int ret;
imx_sc_wdd = devm_kzalloc(dev, sizeof(*imx_sc_wdd), GFP_KERNEL);
@@ -111,42 +162,70 @@ static int imx_sc_wdt_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, imx_sc_wdd);
- imx_sc_wdd->info = &imx_sc_wdt_info;
- imx_sc_wdd->ops = &imx_sc_wdt_ops;
- imx_sc_wdd->min_timeout = 1;
- imx_sc_wdd->max_timeout = MAX_TIMEOUT;
- imx_sc_wdd->parent = dev;
- imx_sc_wdd->timeout = DEFAULT_TIMEOUT;
-
- watchdog_init_timeout(imx_sc_wdd, 0, dev);
- watchdog_stop_on_reboot(imx_sc_wdd);
- watchdog_stop_on_unregister(imx_sc_wdd);
+ wdog = &imx_sc_wdd->wdd;
+ wdog->info = &imx_sc_wdt_info;
+ wdog->ops = &imx_sc_wdt_ops;
+ wdog->min_timeout = 1;
+ wdog->max_timeout = MAX_TIMEOUT;
+ wdog->parent = dev;
+ wdog->timeout = DEFAULT_TIMEOUT;
+
+ watchdog_init_timeout(wdog, 0, dev);
+ watchdog_stop_on_reboot(wdog);
+ watchdog_stop_on_unregister(wdog);
+
+ ret = devm_watchdog_register_device(dev, wdog);
+
+ if (ret) {
+ dev_err(dev, "Failed to register watchdog device\n");
+ return ret;
+ }
+
+ ret = imx_scu_irq_group_enable(SC_IRQ_GROUP_WDOG,
+ SC_IRQ_WDOG,
+ true);
+ if (ret) {
+ dev_warn(dev, "Enable irq failed, pretimeout NOT supported\n");
+ return 0;
+ }
- ret = devm_watchdog_register_device(dev, imx_sc_wdd);
+ imx_sc_wdd->wdt_notifier.notifier_call = imx_sc_wdt_notify;
+ ret = imx_scu_irq_register_notifier(&imx_sc_wdd->wdt_notifier);
if (ret) {
- dev_err(dev, "Failed to register watchdog device\n");
- return ret;
+ imx_scu_irq_group_enable(SC_IRQ_GROUP_WDOG,
+ SC_IRQ_WDOG,
+ false);
+ dev_warn(dev,
+ "Register irq notifier failed, pretimeout NOT supported\n");
+ return 0;
}
+ ret = devm_add_action_or_reset(dev, imx_sc_wdt_action,
+ &imx_sc_wdd->wdt_notifier);
+ if (!ret)
+ imx_sc_wdt_info.options |= WDIOF_PRETIMEOUT;
+ else
+ dev_warn(dev, "Add action failed, pretimeout NOT supported\n");
+
return 0;
}
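
Note the deliberate error-handling shape in this probe: once devm_watchdog_register_device() has succeeded, every later failure (enabling the SCU IRQ group, registering the notifier, adding the devm cleanup action) only warns and returns 0, so the watchdog remains usable without pretimeout. WDIOF_PRETIMEOUT is OR-ed into the watchdog_info only on the fully successful path, which is why imx_sc_wdt_info loses its const qualifier above.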
static int __maybe_unused imx_sc_wdt_suspend(struct device *dev)
{
- struct watchdog_device *imx_sc_wdd = dev_get_drvdata(dev);
+ struct imx_sc_wdt_device *imx_sc_wdd = dev_get_drvdata(dev);
- if (watchdog_active(imx_sc_wdd))
- imx_sc_wdt_stop(imx_sc_wdd);
+ if (watchdog_active(&imx_sc_wdd->wdd))
+ imx_sc_wdt_stop(&imx_sc_wdd->wdd);
return 0;
}
static int __maybe_unused imx_sc_wdt_resume(struct device *dev)
{
- struct watchdog_device *imx_sc_wdd = dev_get_drvdata(dev);
+ struct imx_sc_wdt_device *imx_sc_wdd = dev_get_drvdata(dev);
- if (watchdog_active(imx_sc_wdd))
- imx_sc_wdt_start(imx_sc_wdd);
+ if (watchdog_active(&imx_sc_wdd->wdd))
+ imx_sc_wdt_start(&imx_sc_wdd->wdd);
return 0;
}
diff --git a/drivers/watchdog/intel-mid_wdt.c b/drivers/watchdog/intel-mid_wdt.c
index b2463f8276e6..2cdbd37c700c 100644
--- a/drivers/watchdog/intel-mid_wdt.c
+++ b/drivers/watchdog/intel-mid_wdt.c
@@ -161,10 +161,8 @@ static int mid_wdt_probe(struct platform_device *pdev)
set_bit(WDOG_HW_RUNNING, &wdt_dev->status);
ret = devm_watchdog_register_device(dev, wdt_dev);
- if (ret) {
- dev_err(dev, "error registering watchdog device\n");
+ if (ret)
return ret;
- }
dev_info(dev, "Intel MID watchdog device probed\n");
diff --git a/drivers/watchdog/jz4740_wdt.c b/drivers/watchdog/jz4740_wdt.c
index 313358b2e0b1..d4a90916dd38 100644
--- a/drivers/watchdog/jz4740_wdt.c
+++ b/drivers/watchdog/jz4740_wdt.c
@@ -4,6 +4,7 @@
* JZ4740 Watchdog driver
*/
+#include <linux/mfd/ingenic-tcu.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/types.h>
@@ -19,23 +20,16 @@
#include <asm/mach-jz4740/timer.h>
-#define JZ_REG_WDT_TIMER_DATA 0x0
-#define JZ_REG_WDT_COUNTER_ENABLE 0x4
-#define JZ_REG_WDT_TIMER_COUNTER 0x8
-#define JZ_REG_WDT_TIMER_CONTROL 0xC
-
#define JZ_WDT_CLOCK_PCLK 0x1
#define JZ_WDT_CLOCK_RTC 0x2
#define JZ_WDT_CLOCK_EXT 0x4
-#define JZ_WDT_CLOCK_DIV_SHIFT 3
-
-#define JZ_WDT_CLOCK_DIV_1 (0 << JZ_WDT_CLOCK_DIV_SHIFT)
-#define JZ_WDT_CLOCK_DIV_4 (1 << JZ_WDT_CLOCK_DIV_SHIFT)
-#define JZ_WDT_CLOCK_DIV_16 (2 << JZ_WDT_CLOCK_DIV_SHIFT)
-#define JZ_WDT_CLOCK_DIV_64 (3 << JZ_WDT_CLOCK_DIV_SHIFT)
-#define JZ_WDT_CLOCK_DIV_256 (4 << JZ_WDT_CLOCK_DIV_SHIFT)
-#define JZ_WDT_CLOCK_DIV_1024 (5 << JZ_WDT_CLOCK_DIV_SHIFT)
+#define JZ_WDT_CLOCK_DIV_1 (0 << TCU_TCSR_PRESCALE_LSB)
+#define JZ_WDT_CLOCK_DIV_4 (1 << TCU_TCSR_PRESCALE_LSB)
+#define JZ_WDT_CLOCK_DIV_16 (2 << TCU_TCSR_PRESCALE_LSB)
+#define JZ_WDT_CLOCK_DIV_64 (3 << TCU_TCSR_PRESCALE_LSB)
+#define JZ_WDT_CLOCK_DIV_256 (4 << TCU_TCSR_PRESCALE_LSB)
+#define JZ_WDT_CLOCK_DIV_1024 (5 << TCU_TCSR_PRESCALE_LSB)
#define DEFAULT_HEARTBEAT 5
#define MAX_HEARTBEAT 2048
@@ -63,7 +57,7 @@ static int jz4740_wdt_ping(struct watchdog_device *wdt_dev)
{
struct jz4740_wdt_drvdata *drvdata = watchdog_get_drvdata(wdt_dev);
- writew(0x0, drvdata->base + JZ_REG_WDT_TIMER_COUNTER);
+ writew(0x0, drvdata->base + TCU_REG_WDT_TCNT);
return 0;
}
@@ -74,6 +68,7 @@ static int jz4740_wdt_set_timeout(struct watchdog_device *wdt_dev,
unsigned int rtc_clk_rate;
unsigned int timeout_value;
unsigned short clock_div = JZ_WDT_CLOCK_DIV_1;
+ u8 tcer;
rtc_clk_rate = clk_get_rate(drvdata->rtc_clk);
@@ -86,18 +81,19 @@ static int jz4740_wdt_set_timeout(struct watchdog_device *wdt_dev,
break;
}
timeout_value >>= 2;
- clock_div += (1 << JZ_WDT_CLOCK_DIV_SHIFT);
+ clock_div += (1 << TCU_TCSR_PRESCALE_LSB);
}
- writeb(0x0, drvdata->base + JZ_REG_WDT_COUNTER_ENABLE);
- writew(clock_div, drvdata->base + JZ_REG_WDT_TIMER_CONTROL);
+ tcer = readb(drvdata->base + TCU_REG_WDT_TCER);
+ writeb(0x0, drvdata->base + TCU_REG_WDT_TCER);
+ writew(clock_div, drvdata->base + TCU_REG_WDT_TCSR);
- writew((u16)timeout_value, drvdata->base + JZ_REG_WDT_TIMER_DATA);
- writew(0x0, drvdata->base + JZ_REG_WDT_TIMER_COUNTER);
- writew(clock_div | JZ_WDT_CLOCK_RTC,
- drvdata->base + JZ_REG_WDT_TIMER_CONTROL);
+ writew((u16)timeout_value, drvdata->base + TCU_REG_WDT_TDR);
+ writew(0x0, drvdata->base + TCU_REG_WDT_TCNT);
+ writew(clock_div | JZ_WDT_CLOCK_RTC, drvdata->base + TCU_REG_WDT_TCSR);
- writeb(0x1, drvdata->base + JZ_REG_WDT_COUNTER_ENABLE);
+ if (tcer & TCU_WDT_TCER_TCEN)
+ writeb(TCU_WDT_TCER_TCEN, drvdata->base + TCU_REG_WDT_TCER);
wdt_dev->timeout = new_timeout;
return 0;
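
Two behavioural points in this rework: each prescaler step still pairs a divide-by-4 of timeout_value with the next TCU_TCSR_PRESCALE_LSB divider (1, 4, 16, ..., 1024), so assuming timeout_value starts as rtc_clk_rate * new_timeout as in the pre-existing code, a 5 s timeout on the usual 32768 Hz RTC clock (163840 counts) needs one step and lands at 40960 counts with JZ_WDT_CLOCK_DIV_4. And because TCER is sampled before the registers are rewritten and re-enabled only if the counter was already running, changing the timeout no longer starts a stopped watchdog as a side effect.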
@@ -105,9 +101,18 @@ static int jz4740_wdt_set_timeout(struct watchdog_device *wdt_dev,
static int jz4740_wdt_start(struct watchdog_device *wdt_dev)
{
+ struct jz4740_wdt_drvdata *drvdata = watchdog_get_drvdata(wdt_dev);
+ u8 tcer;
+
+ tcer = readb(drvdata->base + TCU_REG_WDT_TCER);
+
jz4740_timer_enable_watchdog();
jz4740_wdt_set_timeout(wdt_dev, wdt_dev->timeout);
+ /* Start watchdog if it wasn't started already */
+ if (!(tcer & TCU_WDT_TCER_TCEN))
+ writeb(TCU_WDT_TCER_TCEN, drvdata->base + TCU_REG_WDT_TCER);
+
return 0;
}
@@ -115,7 +120,7 @@ static int jz4740_wdt_stop(struct watchdog_device *wdt_dev)
{
struct jz4740_wdt_drvdata *drvdata = watchdog_get_drvdata(wdt_dev);
- writeb(0x0, drvdata->base + JZ_REG_WDT_COUNTER_ENABLE);
+ writeb(0x0, drvdata->base + TCU_REG_WDT_TCER);
jz4740_timer_disable_watchdog();
return 0;
@@ -187,11 +192,7 @@ static int jz4740_wdt_probe(struct platform_device *pdev)
return PTR_ERR(drvdata->rtc_clk);
}
- ret = devm_watchdog_register_device(dev, &drvdata->wdt);
- if (ret < 0)
- return ret;
-
- return 0;
+ return devm_watchdog_register_device(dev, &drvdata->wdt);
}
static struct platform_driver jz4740_wdt_driver = {
diff --git a/drivers/watchdog/loongson1_wdt.c b/drivers/watchdog/loongson1_wdt.c
index c8c2b8a88fc2..bb3d075c0633 100644
--- a/drivers/watchdog/loongson1_wdt.c
+++ b/drivers/watchdog/loongson1_wdt.c
@@ -132,10 +132,8 @@ static int ls1x_wdt_probe(struct platform_device *pdev)
watchdog_set_drvdata(ls1x_wdt, drvdata);
err = devm_watchdog_register_device(dev, &drvdata->wdt);
- if (err) {
- dev_err(dev, "failed to register watchdog device\n");
+ if (err)
return err;
- }
platform_set_drvdata(pdev, drvdata);
diff --git a/drivers/watchdog/max77620_wdt.c b/drivers/watchdog/max77620_wdt.c
index 9937f9fccd2e..be6a53c30002 100644
--- a/drivers/watchdog/max77620_wdt.c
+++ b/drivers/watchdog/max77620_wdt.c
@@ -182,13 +182,7 @@ static int max77620_wdt_probe(struct platform_device *pdev)
watchdog_set_drvdata(wdt_dev, wdt);
watchdog_stop_on_unregister(wdt_dev);
- ret = devm_watchdog_register_device(dev, wdt_dev);
- if (ret < 0) {
- dev_err(dev, "watchdog registration failed: %d\n", ret);
- return ret;
- }
-
- return 0;
+ return devm_watchdog_register_device(dev, wdt_dev);
}
static const struct platform_device_id max77620_wdt_devtype[] = {
diff --git a/drivers/watchdog/mei_wdt.c b/drivers/watchdog/mei_wdt.c
index 96a770938ff0..5391bf3e6b11 100644
--- a/drivers/watchdog/mei_wdt.c
+++ b/drivers/watchdog/mei_wdt.c
@@ -384,10 +384,8 @@ static int mei_wdt_register(struct mei_wdt *wdt)
watchdog_stop_on_reboot(&wdt->wdd);
ret = watchdog_register_device(&wdt->wdd);
- if (ret) {
- dev_err(dev, "unable to register watchdog device = %d.\n", ret);
+ if (ret)
watchdog_set_drvdata(&wdt->wdd, NULL);
- }
wdt->state = MEI_WDT_IDLE;
diff --git a/drivers/watchdog/mena21_wdt.c b/drivers/watchdog/mena21_wdt.c
index e9ca4e0e25dc..99d2359d5a8a 100644
--- a/drivers/watchdog/mena21_wdt.c
+++ b/drivers/watchdog/mena21_wdt.c
@@ -190,10 +190,8 @@ static int a21_wdt_probe(struct platform_device *pdev)
dev_set_drvdata(dev, drv);
ret = devm_watchdog_register_device(dev, &a21_wdt);
- if (ret) {
- dev_err(dev, "Cannot register watchdog device\n");
+ if (ret)
return ret;
- }
dev_info(dev, "MEN A21 watchdog timer driver enabled\n");
diff --git a/drivers/watchdog/menf21bmc_wdt.c b/drivers/watchdog/menf21bmc_wdt.c
index 7766d7361d3b..81ebdfc371f4 100644
--- a/drivers/watchdog/menf21bmc_wdt.c
+++ b/drivers/watchdog/menf21bmc_wdt.c
@@ -152,10 +152,8 @@ static int menf21bmc_wdt_probe(struct platform_device *pdev)
}
ret = devm_watchdog_register_device(dev, &drv_data->wdt);
- if (ret) {
- dev_err(dev, "failed to register Watchdog device\n");
+ if (ret)
return ret;
- }
dev_info(dev, "MEN 14F021P00 BMC Watchdog device enabled\n");
diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c
index b6ffad421bd0..3fc457bc16db 100644
--- a/drivers/watchdog/mpc8xxx_wdt.c
+++ b/drivers/watchdog/mpc8xxx_wdt.c
@@ -201,11 +201,8 @@ static int mpc8xxx_wdt_probe(struct platform_device *ofdev)
ddata->wdd.timeout = ddata->wdd.min_timeout;
ret = devm_watchdog_register_device(dev, &ddata->wdd);
- if (ret) {
- dev_err(dev, "cannot register watchdog device (err=%d)\n",
- ret);
+ if (ret)
return ret;
- }
dev_info(dev,
"WDT driver for MPC8xxx initialized. mode:%s timeout=%d sec\n",
diff --git a/drivers/watchdog/mv64x60_wdt.c b/drivers/watchdog/mv64x60_wdt.c
index c785f4f0a196..74bf7144a970 100644
--- a/drivers/watchdog/mv64x60_wdt.c
+++ b/drivers/watchdog/mv64x60_wdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* mv64x60_wdt.c - MV64X60 (Marvell Discovery) watchdog userspace interface
*
@@ -9,10 +10,7 @@
*
* Derived from mpc8xx_wdt.c, with the following copyright.
*
- * 2002 (c) Florian Schirmer <jolt@tuxbox.org> This file is licensed under
- * the terms of the GNU General Public License version 2. This program
- * is licensed "as is" without any warranty of any kind, whether express
- * or implied.
+ * 2002 (c) Florian Schirmer <jolt@tuxbox.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/drivers/watchdog/ni903x_wdt.c b/drivers/watchdog/ni903x_wdt.c
index 60f5608af2a8..4cebad324b20 100644
--- a/drivers/watchdog/ni903x_wdt.c
+++ b/drivers/watchdog/ni903x_wdt.c
@@ -211,10 +211,8 @@ static int ni903x_acpi_add(struct acpi_device *device)
watchdog_init_timeout(wdd, timeout, dev);
ret = watchdog_register_device(wdd);
- if (ret) {
- dev_err(dev, "failed to register watchdog\n");
+ if (ret)
return ret;
- }
/* Switch from boot mode to user mode */
outb(NIWD_CONTROL_RESET | NIWD_CONTROL_MODE,
diff --git a/drivers/watchdog/nic7018_wdt.c b/drivers/watchdog/nic7018_wdt.c
index 2e1a2a3d4ec9..2a46cc662943 100644
--- a/drivers/watchdog/nic7018_wdt.c
+++ b/drivers/watchdog/nic7018_wdt.c
@@ -210,7 +210,6 @@ static int nic7018_probe(struct platform_device *pdev)
ret = watchdog_register_device(wdd);
if (ret) {
outb(LOCK, wdt->io_base + WDT_REG_LOCK);
- dev_err(dev, "failed to register watchdog\n");
return ret;
}
diff --git a/drivers/watchdog/npcm_wdt.c b/drivers/watchdog/npcm_wdt.c
index 9d6c1689b12c..9c773c3d6d5d 100644
--- a/drivers/watchdog/npcm_wdt.c
+++ b/drivers/watchdog/npcm_wdt.c
@@ -220,10 +220,8 @@ static int npcm_wdt_probe(struct platform_device *pdev)
return ret;
ret = devm_watchdog_register_device(dev, &wdt->wdd);
- if (ret) {
- dev_err(dev, "failed to register watchdog\n");
+ if (ret)
return ret;
- }
dev_info(dev, "NPCM watchdog driver enabled\n");
diff --git a/drivers/watchdog/nv_tco.h b/drivers/watchdog/nv_tco.h
index c2d1d04e055b..d325e528010f 100644
--- a/drivers/watchdog/nv_tco.h
+++ b/drivers/watchdog/nv_tco.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* nv_tco: TCO timer driver for nVidia chipsets.
*
@@ -10,11 +11,6 @@
* Reserved.
* http://www.kernelconcepts.de
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither kernel concepts nor Nils Faerber admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/octeon-wdt-main.c b/drivers/watchdog/octeon-wdt-main.c
index 0ec419a3f7ed..fde9e739b436 100644
--- a/drivers/watchdog/octeon-wdt-main.c
+++ b/drivers/watchdog/octeon-wdt-main.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Octeon Watchdog driver
*
@@ -10,22 +11,12 @@
* (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
*
* (c) Copyright 1995 Alan Cox <alan@lxorguk.ukuu.org.uk>
*
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- *
* The OCTEON watchdog has a maximum timeout of 2^32 * io_clock.
* For most systems this is less than 10 seconds, so to allow for
* software to request longer watchdog heartbeats, we maintain software
diff --git a/drivers/watchdog/of_xilinx_wdt.c b/drivers/watchdog/of_xilinx_wdt.c
index 03786992b701..7fe4f7c3f7ce 100644
--- a/drivers/watchdog/of_xilinx_wdt.c
+++ b/drivers/watchdog/of_xilinx_wdt.c
@@ -238,10 +238,8 @@ static int xwdt_probe(struct platform_device *pdev)
}
rc = devm_watchdog_register_device(dev, xilinx_wdt_wdd);
- if (rc) {
- dev_err(dev, "Cannot register watchdog (err=%d)\n", rc);
+ if (rc)
return rc;
- }
clk_disable(xdev->clk);
diff --git a/drivers/watchdog/omap_wdt.c b/drivers/watchdog/omap_wdt.c
index d49688d93f6a..9b91882fe3c4 100644
--- a/drivers/watchdog/omap_wdt.c
+++ b/drivers/watchdog/omap_wdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* omap_wdt.c
*
@@ -6,10 +7,7 @@
* Author: MontaVista Software, Inc.
* <gdavis@mvista.com> or <source@mvista.com>
*
- * 2003 (c) MontaVista Software, Inc. This file is licensed under the
- * terms of the GNU General Public License version 2. This program is
- * licensed "as is" without any warranty of any kind, whether express
- * or implied.
+ * 2003 (c) MontaVista Software, Inc.
*
* History:
*
diff --git a/drivers/watchdog/omap_wdt.h b/drivers/watchdog/omap_wdt.h
index 42f31ec5e90d..950b4643f3e7 100644
--- a/drivers/watchdog/omap_wdt.h
+++ b/drivers/watchdog/omap_wdt.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* linux/drivers/char/watchdog/omap_wdt.h
*
@@ -5,26 +6,6 @@
* OMAP Watchdog timer register definitions
*
* Copyright (C) 2004 Texas Instruments.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
- * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _OMAP_WATCHDOG_H
diff --git a/drivers/watchdog/pc87413_wdt.c b/drivers/watchdog/pc87413_wdt.c
index ca21d6c240a3..2af1a8b3f973 100644
--- a/drivers/watchdog/pc87413_wdt.c
+++ b/drivers/watchdog/pc87413_wdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* NS pc87413-wdt Watchdog Timer driver for Linux 2.6.x.x
*
@@ -6,11 +7,6 @@
* (C) Copyright 2006 Sven Anders, <anders@anduras.de>
* and Marcus Junker, <junker@anduras.de>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Sven Anders, Marcus Junker nor ANDURAS AG
* admit liability nor provide warranty for any of this software.
* This material is provided "AS-IS" and at no charge.
diff --git a/drivers/watchdog/pcwd_pci.c b/drivers/watchdog/pcwd_pci.c
index 5773d2591d3f..e30c1f762045 100644
--- a/drivers/watchdog/pcwd_pci.c
+++ b/drivers/watchdog/pcwd_pci.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Berkshire PCI-PC Watchdog Card Driver
*
@@ -10,11 +11,6 @@
* Matt Domsch <Matt_Domsch@dell.com>,
* Rob Radez <rob@osinvestor.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Wim Van Sebroeck nor Iguana vzw. admit liability nor
* provide warranty for any of this software. This material is
* provided "AS-IS" and at no charge.
diff --git a/drivers/watchdog/pcwd_usb.c b/drivers/watchdog/pcwd_usb.c
index 5de6182dae33..6727f8ab2d18 100644
--- a/drivers/watchdog/pcwd_usb.c
+++ b/drivers/watchdog/pcwd_usb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Berkshire USB-PC Watchdog Card Driver
*
@@ -10,11 +11,6 @@
* Rob Radez <rob@osinvestor.com>,
* Greg Kroah-Hartman <greg@kroah.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Wim Van Sebroeck nor Iguana vzw. admit liability nor
* provide warranty for any of this software. This material is
* provided "AS-IS" and at no charge.
diff --git a/drivers/watchdog/pic32-dmt.c b/drivers/watchdog/pic32-dmt.c
index 4f2aca78f13a..f43062b3c4c8 100644
--- a/drivers/watchdog/pic32-dmt.c
+++ b/drivers/watchdog/pic32-dmt.c
@@ -212,10 +212,8 @@ static int pic32_dmt_probe(struct platform_device *pdev)
watchdog_set_drvdata(wdd, dmt);
ret = devm_watchdog_register_device(dev, wdd);
- if (ret) {
- dev_err(dev, "watchdog register failed, err %d\n", ret);
+ if (ret)
return ret;
- }
platform_set_drvdata(pdev, wdd);
return 0;
diff --git a/drivers/watchdog/pic32-wdt.c b/drivers/watchdog/pic32-wdt.c
index 5ecdd880f0b7..41715d68d9e9 100644
--- a/drivers/watchdog/pic32-wdt.c
+++ b/drivers/watchdog/pic32-wdt.c
@@ -221,10 +221,8 @@ static int pic32_wdt_drv_probe(struct platform_device *pdev)
watchdog_set_drvdata(wdd, wdt);
ret = devm_watchdog_register_device(dev, wdd);
- if (ret) {
- dev_err(dev, "watchdog register failed, err %d\n", ret);
+ if (ret)
return ret;
- }
platform_set_drvdata(pdev, wdd);
diff --git a/drivers/watchdog/pnx4008_wdt.c b/drivers/watchdog/pnx4008_wdt.c
index d9e03544aeae..7b446b696f2b 100644
--- a/drivers/watchdog/pnx4008_wdt.c
+++ b/drivers/watchdog/pnx4008_wdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* drivers/char/watchdog/pnx4008_wdt.c
*
@@ -11,10 +12,6 @@
* 2005-2006 (c) MontaVista Software, Inc.
*
* (C) 2012 Wolfram Sang, Pengutronix
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -221,10 +218,8 @@ static int pnx4008_wdt_probe(struct platform_device *pdev)
set_bit(WDOG_HW_RUNNING, &pnx4008_wdd.status);
ret = devm_watchdog_register_device(dev, &pnx4008_wdd);
- if (ret < 0) {
- dev_err(dev, "cannot register watchdog device\n");
+ if (ret < 0)
return ret;
- }
dev_info(dev, "heartbeat %d sec\n", pnx4008_wdd.timeout);
diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c
index fc0f7e5de38d..7be7f87be28f 100644
--- a/drivers/watchdog/qcom-wdt.c
+++ b/drivers/watchdog/qcom-wdt.c
@@ -223,10 +223,8 @@ static int qcom_wdt_probe(struct platform_device *pdev)
watchdog_init_timeout(&wdt->wdd, 0, dev);
ret = devm_watchdog_register_device(dev, &wdt->wdd);
- if (ret) {
- dev_err(dev, "failed to register watchdog\n");
+ if (ret)
return ret;
- }
platform_set_drvdata(pdev, wdt);
return 0;
diff --git a/drivers/watchdog/rave-sp-wdt.c b/drivers/watchdog/rave-sp-wdt.c
index 35db173252f9..2c95615b6354 100644
--- a/drivers/watchdog/rave-sp-wdt.c
+++ b/drivers/watchdog/rave-sp-wdt.c
@@ -310,7 +310,6 @@ static int rave_sp_wdt_probe(struct platform_device *pdev)
ret = devm_watchdog_register_device(dev, wdd);
if (ret) {
- dev_err(dev, "Failed to register watchdog device\n");
rave_sp_wdt_stop(wdd);
return ret;
}
diff --git a/drivers/watchdog/renesas_wdt.c b/drivers/watchdog/renesas_wdt.c
index 565dbc1ec638..00662a8e039c 100644
--- a/drivers/watchdog/renesas_wdt.c
+++ b/drivers/watchdog/renesas_wdt.c
@@ -7,6 +7,7 @@
*/
#include <linux/bitops.h>
#include <linux/clk.h>
+#include <linux/delay.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -70,6 +71,15 @@ static int rwdt_init_timeout(struct watchdog_device *wdev)
return 0;
}
+static void rwdt_wait_cycles(struct rwdt_priv *priv, unsigned int cycles)
+{
+ unsigned int delay;
+
+ delay = DIV_ROUND_UP(cycles * 1000000, priv->clk_rate);
+
+ usleep_range(delay, 2 * delay);
+}
+
static int rwdt_start(struct watchdog_device *wdev)
{
struct rwdt_priv *priv = watchdog_get_drvdata(wdev);
@@ -80,6 +90,8 @@ static int rwdt_start(struct watchdog_device *wdev)
/* Stop the timer before we modify any register */
val = readb_relaxed(priv->base + RWTCSRA) & ~RWTCSRA_TME;
rwdt_write(priv, val, RWTCSRA);
+ /* Delay 2 cycles before setting watchdog counter */
+ rwdt_wait_cycles(priv, 2);
rwdt_init_timeout(wdev);
rwdt_write(priv, priv->cks, RWTCSRA);
@@ -98,6 +110,8 @@ static int rwdt_stop(struct watchdog_device *wdev)
struct rwdt_priv *priv = watchdog_get_drvdata(wdev);
rwdt_write(priv, priv->cks, RWTCSRA);
+ /* Delay 3 cycles before disabling module clock */
+ rwdt_wait_cycles(priv, 3);
pm_runtime_put(wdev->parent);
return 0;
@@ -175,15 +189,16 @@ static inline bool rwdt_blacklisted(struct device *dev) { return false; }
static int rwdt_probe(struct platform_device *pdev)
{
+ struct device *dev = &pdev->dev;
struct rwdt_priv *priv;
struct clk *clk;
unsigned long clks_per_sec;
int ret, i;
- if (rwdt_blacklisted(&pdev->dev))
+ if (rwdt_blacklisted(dev))
return -ENODEV;
- priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+ priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
if (!priv)
return -ENOMEM;
@@ -191,16 +206,16 @@ static int rwdt_probe(struct platform_device *pdev)
if (IS_ERR(priv->base))
return PTR_ERR(priv->base);
- clk = devm_clk_get(&pdev->dev, NULL);
+ clk = devm_clk_get(dev, NULL);
if (IS_ERR(clk))
return PTR_ERR(clk);
- pm_runtime_enable(&pdev->dev);
- pm_runtime_get_sync(&pdev->dev);
+ pm_runtime_enable(dev);
+ pm_runtime_get_sync(dev);
priv->clk_rate = clk_get_rate(clk);
priv->wdev.bootstatus = (readb_relaxed(priv->base + RWTCSRA) &
RWTCSRA_WOVF) ? WDIOF_CARDRESET : 0;
- pm_runtime_put(&pdev->dev);
+ pm_runtime_put(dev);
if (!priv->clk_rate) {
ret = -ENOENT;
@@ -216,14 +231,14 @@ static int rwdt_probe(struct platform_device *pdev)
}
if (i < 0) {
- dev_err(&pdev->dev, "Can't find suitable clock divider\n");
+ dev_err(dev, "Can't find suitable clock divider\n");
ret = -ERANGE;
goto out_pm_disable;
}
priv->wdev.info = &rwdt_ident;
priv->wdev.ops = &rwdt_ops;
- priv->wdev.parent = &pdev->dev;
+ priv->wdev.parent = dev;
priv->wdev.min_timeout = 1;
priv->wdev.max_timeout = DIV_BY_CLKS_PER_SEC(priv, 65536);
priv->wdev.timeout = min(priv->wdev.max_timeout, RWDT_DEFAULT_TIMEOUT);
@@ -235,7 +250,7 @@ static int rwdt_probe(struct platform_device *pdev)
watchdog_stop_on_unregister(&priv->wdev);
/* This overrides the default timeout only if DT configuration was found */
- watchdog_init_timeout(&priv->wdev, 0, &pdev->dev);
+ watchdog_init_timeout(&priv->wdev, 0, dev);
ret = watchdog_register_device(&priv->wdev);
if (ret < 0)
@@ -244,7 +259,7 @@ static int rwdt_probe(struct platform_device *pdev)
return 0;
out_pm_disable:
- pm_runtime_disable(&pdev->dev);
+ pm_runtime_disable(dev);
return ret;
}
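
The rwdt_wait_cycles() helper added above converts a cycle count on the watchdog clock into a microsecond delay, rounding up so the driver never sleeps too little before touching the registers again. A minimal userspace sketch of that arithmetic, assuming a hypothetical 32768 Hz clock rate (the real driver obtains the rate via clk_get_rate()):

    #include <stdio.h>

    /* mirror of the kernel's DIV_ROUND_UP() */
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned long clk_rate = 32768;  /* hypothetical RCLK rate in Hz */
        unsigned long cycles = 3;        /* cycles needed before clock gating */
        unsigned long delay = DIV_ROUND_UP(cycles * 1000000UL, clk_rate);

        /* the driver then calls usleep_range(delay, 2 * delay) */
        printf("wait at least %lu us\n", delay);  /* 92 us here */
        return 0;
    }

The 2x upper bound handed to usleep_range() gives the scheduler room to coalesce the wakeup with other timers.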
diff --git a/drivers/watchdog/retu_wdt.c b/drivers/watchdog/retu_wdt.c
index 39cd51df2ffc..258dfcf9cbda 100644
--- a/drivers/watchdog/retu_wdt.c
+++ b/drivers/watchdog/retu_wdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Retu watchdog driver
*
@@ -5,15 +6,6 @@
*
* Based on code written by Amit Kucheria and Michael Buesch.
* Rewritten by Aaro Koskinen.
- *
- * This file is subject to the terms and conditions of the GNU General
- * Public License. See the file "COPYING" in the main directory of this
- * archive for more details.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/slab.h>
diff --git a/drivers/watchdog/s3c2410_wdt.c b/drivers/watchdog/s3c2410_wdt.c
index daf3bf0d86b8..2395f353e52d 100644
--- a/drivers/watchdog/s3c2410_wdt.c
+++ b/drivers/watchdog/s3c2410_wdt.c
@@ -606,10 +606,8 @@ static int s3c2410wdt_probe(struct platform_device *pdev)
wdt->wdt_device.parent = dev;
ret = watchdog_register_device(&wdt->wdt_device);
- if (ret) {
- dev_err(dev, "cannot register watchdog (%d)\n", ret);
+ if (ret)
goto err_cpufreq;
- }
ret = s3c2410wdt_mask_and_disable_reset(wdt, false);
if (ret < 0)
diff --git a/drivers/watchdog/sa1100_wdt.c b/drivers/watchdog/sa1100_wdt.c
index bfa035e1a75e..cbd8c957182f 100644
--- a/drivers/watchdog/sa1100_wdt.c
+++ b/drivers/watchdog/sa1100_wdt.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Watchdog driver for the SA11x0/PXA2xx
*
* (c) Copyright 2000 Oleg Drokin <green@crimea.edu>
* Based on SoftDog driver by Alan Cox <alan@lxorguk.ukuu.org.uk>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Oleg Drokin nor iXcelerator.com admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/sama5d4_wdt.c b/drivers/watchdog/sama5d4_wdt.c
index b8da1bf21e12..d193a60430b2 100644
--- a/drivers/watchdog/sama5d4_wdt.c
+++ b/drivers/watchdog/sama5d4_wdt.c
@@ -110,9 +110,7 @@ static int sama5d4_wdt_set_timeout(struct watchdog_device *wdd,
u32 value = WDT_SEC2TICKS(timeout);
wdt->mr &= ~AT91_WDT_WDV;
- wdt->mr &= ~AT91_WDT_WDD;
wdt->mr |= AT91_WDT_SET_WDV(value);
- wdt->mr |= AT91_WDT_SET_WDD(value);
/*
* WDDIS has to be 0 when updating WDD/WDV. The datasheet states: When
@@ -248,7 +246,7 @@ static int sama5d4_wdt_probe(struct platform_device *pdev)
timeout = WDT_SEC2TICKS(wdd->timeout);
- wdt->mr |= AT91_WDT_SET_WDD(timeout);
+ wdt->mr |= AT91_WDT_SET_WDD(WDT_SEC2TICKS(MAX_WDT_TIMEOUT));
wdt->mr |= AT91_WDT_SET_WDV(timeout);
ret = sama5d4_wdt_init(wdt);
@@ -259,10 +257,8 @@ static int sama5d4_wdt_probe(struct platform_device *pdev)
watchdog_stop_on_unregister(wdd);
ret = devm_watchdog_register_device(dev, wdd);
- if (ret) {
- dev_err(dev, "failed to register watchdog device\n");
+ if (ret)
return ret;
- }
platform_set_drvdata(pdev, wdt);
@@ -279,7 +275,17 @@ static const struct of_device_id sama5d4_wdt_of_match[] = {
MODULE_DEVICE_TABLE(of, sama5d4_wdt_of_match);
#ifdef CONFIG_PM_SLEEP
-static int sama5d4_wdt_resume(struct device *dev)
+static int sama5d4_wdt_suspend_late(struct device *dev)
+{
+ struct sama5d4_wdt *wdt = dev_get_drvdata(dev);
+
+ if (watchdog_active(&wdt->wdd))
+ sama5d4_wdt_stop(&wdt->wdd);
+
+ return 0;
+}
+
+static int sama5d4_wdt_resume_early(struct device *dev)
{
struct sama5d4_wdt *wdt = dev_get_drvdata(dev);
@@ -290,12 +296,17 @@ static int sama5d4_wdt_resume(struct device *dev)
*/
sama5d4_wdt_init(wdt);
+ if (watchdog_active(&wdt->wdd))
+ sama5d4_wdt_start(&wdt->wdd);
+
return 0;
}
#endif
-static SIMPLE_DEV_PM_OPS(sama5d4_wdt_pm_ops, NULL,
- sama5d4_wdt_resume);
+static const struct dev_pm_ops sama5d4_wdt_pm_ops = {
+ SET_LATE_SYSTEM_SLEEP_PM_OPS(sama5d4_wdt_suspend_late,
+ sama5d4_wdt_resume_early)
+};
static struct platform_driver sama5d4_wdt_driver = {
.probe = sama5d4_wdt_probe,
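
The probe change above stops deriving the SAMA5D4 watchdog's WDD field from the requested timeout and pins it at the maximum instead: on this IP a restart written while the down-counter is still above WDD triggers a watchdog error, so tying WDD to WDV made early pings dangerous. A toy model of that window check, assuming a hypothetical 256 Hz tick (32 kHz slow clock divided by 128; the driver's real conversion is WDT_SEC2TICKS()):

    #include <stdbool.h>
    #include <stdio.h>

    #define WDT_TICK_HZ      256              /* assumed tick rate */
    #define WDT_SEC2TICKS(s) ((s) * WDT_TICK_HZ)
    #define MAX_WDT_TIMEOUT  16               /* seconds, assumed counter limit */

    /* a restart is only legal while the down-counter is at or below WDD */
    static bool restart_allowed(unsigned int counter, unsigned int wdd)
    {
        return counter <= wdd;
    }

    int main(void)
    {
        unsigned int wdv = WDT_SEC2TICKS(4);                /* timeout */
        unsigned int wdd = WDT_SEC2TICKS(MAX_WDT_TIMEOUT);  /* pinned at max */

        /* right after a reload the counter starts back at WDV */
        printf("ping after reload: %s\n",
               restart_allowed(wdv, wdd) ? "ok" : "watchdog error");
        return 0;
    }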
diff --git a/drivers/watchdog/sbc7240_wdt.c b/drivers/watchdog/sbc7240_wdt.c
index efc81b318939..12cdee7d5069 100644
--- a/drivers/watchdog/sbc7240_wdt.c
+++ b/drivers/watchdog/sbc7240_wdt.c
@@ -1,19 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NANO7240 SBC Watchdog device driver
*
* Based on w83877f.c by Scott Jennings,
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation;
- *
- * Software distributed under the License is distributed on an "AS IS"
- * basis, WITHOUT WARRANTY OF ANY KIND, either express or
- * implied. See the License for the specific language governing
- * rights and limitations under the License.
- *
* (c) Copyright 2007 Gilles GIGAN <gilles.gigan@jcu.edu.au>
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/drivers/watchdog/sbc8360.c b/drivers/watchdog/sbc8360.c
index 3396024e7b76..4f8b9912fc51 100644
--- a/drivers/watchdog/sbc8360.c
+++ b/drivers/watchdog/sbc8360.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* SBC8360 Watchdog driver
*
@@ -19,11 +20,6 @@
* (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/sch311x_wdt.c b/drivers/watchdog/sch311x_wdt.c
index ed6e9fac5d74..3612f1df381b 100644
--- a/drivers/watchdog/sch311x_wdt.c
+++ b/drivers/watchdog/sch311x_wdt.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* sch311x_wdt.c - Driver for the SCH311x Super-I/O chips
* integrated watchdog.
*
* (c) Copyright 2008 Wim Van Sebroeck <wim@iguana.be>.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Wim Van Sebroeck nor Iguana vzw. admit liability nor
* provide warranty for any of this software. This material is
* provided "AS-IS" and at no charge.
diff --git a/drivers/watchdog/softdog.c b/drivers/watchdog/softdog.c
index 060740625485..3e4885c1545e 100644
--- a/drivers/watchdog/softdog.c
+++ b/drivers/watchdog/softdog.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* SoftDog: A Software Watchdog Device
*
* (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/sp5100_tco.c b/drivers/watchdog/sp5100_tco.c
index cd4430ff9b1c..93bd302ae7c5 100644
--- a/drivers/watchdog/sp5100_tco.c
+++ b/drivers/watchdog/sp5100_tco.c
@@ -402,10 +402,8 @@ static int sp5100_tco_probe(struct platform_device *pdev)
return ret;
ret = devm_watchdog_register_device(dev, wdd);
- if (ret) {
- dev_err(dev, "cannot register watchdog device (err=%d)\n", ret);
+ if (ret)
return ret;
- }
/* Show module parameters */
dev_info(dev, "initialized. heartbeat=%d sec (nowayout=%d)\n",
diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c
index 072986d461b7..53e04926a7b2 100644
--- a/drivers/watchdog/sp805_wdt.c
+++ b/drivers/watchdog/sp805_wdt.c
@@ -288,11 +288,8 @@ sp805_wdt_probe(struct amba_device *adev, const struct amba_id *id)
}
ret = watchdog_register_device(&wdt->wdd);
- if (ret) {
- dev_err(&adev->dev, "watchdog_register_device() failed: %d\n",
- ret);
+ if (ret)
goto err;
- }
amba_set_drvdata(adev, wdt);
dev_info(&adev->dev, "registration successful\n");
diff --git a/drivers/watchdog/sprd_wdt.c b/drivers/watchdog/sprd_wdt.c
index 916fb3f96bdc..edba4e278685 100644
--- a/drivers/watchdog/sprd_wdt.c
+++ b/drivers/watchdog/sprd_wdt.c
@@ -320,7 +320,6 @@ static int sprd_wdt_probe(struct platform_device *pdev)
ret = devm_watchdog_register_device(dev, &wdt->wdd);
if (ret) {
sprd_wdt_disable(wdt);
- dev_err(dev, "failed to register watchdog\n");
return ret;
}
platform_set_drvdata(pdev, wdt);
diff --git a/drivers/watchdog/st_lpc_wdt.c b/drivers/watchdog/st_lpc_wdt.c
index 7a90184eb950..14ab6559c748 100644
--- a/drivers/watchdog/st_lpc_wdt.c
+++ b/drivers/watchdog/st_lpc_wdt.c
@@ -228,10 +228,8 @@ static int st_wdog_probe(struct platform_device *pdev)
return ret;
ret = devm_watchdog_register_device(dev, &st_wdog_dev);
- if (ret) {
- dev_err(dev, "Unable to register watchdog\n");
+ if (ret)
return ret;
- }
st_wdog_setup(st_wdog, true);
diff --git a/drivers/watchdog/stm32_iwdg.c b/drivers/watchdog/stm32_iwdg.c
index d569a3634d9b..a3a329011a06 100644
--- a/drivers/watchdog/stm32_iwdg.c
+++ b/drivers/watchdog/stm32_iwdg.c
@@ -263,10 +263,8 @@ static int stm32_iwdg_probe(struct platform_device *pdev)
watchdog_init_timeout(wdd, 0, dev);
ret = devm_watchdog_register_device(dev, wdd);
- if (ret) {
- dev_err(dev, "failed to register watchdog device\n");
+ if (ret)
return ret;
- }
platform_set_drvdata(pdev, wdt);
diff --git a/drivers/watchdog/stmp3xxx_rtc_wdt.c b/drivers/watchdog/stmp3xxx_rtc_wdt.c
index 671f4ba7b4ed..7caf3aa71c6a 100644
--- a/drivers/watchdog/stmp3xxx_rtc_wdt.c
+++ b/drivers/watchdog/stmp3xxx_rtc_wdt.c
@@ -98,10 +98,8 @@ static int stmp3xxx_wdt_probe(struct platform_device *pdev)
stmp3xxx_wdd.parent = dev;
ret = devm_watchdog_register_device(dev, &stmp3xxx_wdd);
- if (ret < 0) {
- dev_err(dev, "cannot register watchdog device\n");
+ if (ret < 0)
return ret;
- }
if (register_reboot_notifier(&wdt_notifier))
dev_warn(dev, "cannot register reboot notifier\n");
diff --git a/drivers/watchdog/tegra_wdt.c b/drivers/watchdog/tegra_wdt.c
index a58b000acc4f..dfe06e506cad 100644
--- a/drivers/watchdog/tegra_wdt.c
+++ b/drivers/watchdog/tegra_wdt.c
@@ -219,10 +219,8 @@ static int tegra_wdt_probe(struct platform_device *pdev)
watchdog_stop_on_unregister(wdd);
ret = devm_watchdog_register_device(dev, wdd);
- if (ret) {
- dev_err(dev, "failed to register watchdog device\n");
+ if (ret)
return ret;
- }
platform_set_drvdata(pdev, wdt);
diff --git a/drivers/watchdog/ts4800_wdt.c b/drivers/watchdog/ts4800_wdt.c
index 9dc6d7f45806..c137ad2bd5c3 100644
--- a/drivers/watchdog/ts4800_wdt.c
+++ b/drivers/watchdog/ts4800_wdt.c
@@ -171,10 +171,8 @@ static int ts4800_wdt_probe(struct platform_device *pdev)
ts4800_wdt_stop(wdd);
ret = devm_watchdog_register_device(dev, wdd);
- if (ret) {
- dev_err(dev, "failed to register watchdog device\n");
+ if (ret)
return ret;
- }
platform_set_drvdata(pdev, wdt);
diff --git a/drivers/watchdog/w83627hf_wdt.c b/drivers/watchdog/w83627hf_wdt.c
index 3a49ba9ea608..38b31e9947aa 100644
--- a/drivers/watchdog/w83627hf_wdt.c
+++ b/drivers/watchdog/w83627hf_wdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* w83627hf/thf WDT driver
*
@@ -17,11 +18,6 @@
* (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/wafer5823wdt.c b/drivers/watchdog/wafer5823wdt.c
index 0a8073b419f8..6d2071a0590d 100644
--- a/drivers/watchdog/wafer5823wdt.c
+++ b/drivers/watchdog/wafer5823wdt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* ICP Wafer 5823 Single Board Computer WDT driver
* http://www.icpamerica.com/wafer_5823.php
@@ -13,11 +14,6 @@
* (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c
index 62be9e52a4de..21e8085b848b 100644
--- a/drivers/watchdog/watchdog_core.c
+++ b/drivers/watchdog/watchdog_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* watchdog_core.c
*
@@ -16,11 +17,6 @@
* Satyam Sharma <satyam@infradead.org>
* Randy Dunlap <randy.dunlap@oracle.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox, CymruNet Ltd., Wim Van Sebroeck nor Iguana vzw.
* admit liability nor provide warranty for any of this software.
* This material is provided "AS-IS" and at no charge.
@@ -60,11 +56,10 @@ static DEFINE_MUTEX(wtd_deferred_reg_mutex);
static LIST_HEAD(wtd_deferred_reg_list);
static bool wtd_deferred_reg_done;
-static int watchdog_deferred_registration_add(struct watchdog_device *wdd)
+static void watchdog_deferred_registration_add(struct watchdog_device *wdd)
{
list_add_tail(&wdd->deferred,
&wtd_deferred_reg_list);
- return 0;
}
static void watchdog_deferred_registration_del(struct watchdog_device *wdd)
@@ -265,14 +260,23 @@ static int __watchdog_register_device(struct watchdog_device *wdd)
int watchdog_register_device(struct watchdog_device *wdd)
{
- int ret;
+ const char *dev_str;
+ int ret = 0;
mutex_lock(&wtd_deferred_reg_mutex);
if (wtd_deferred_reg_done)
ret = __watchdog_register_device(wdd);
else
- ret = watchdog_deferred_registration_add(wdd);
+ watchdog_deferred_registration_add(wdd);
mutex_unlock(&wtd_deferred_reg_mutex);
+
+ if (ret) {
+ dev_str = wdd->parent ? dev_name(wdd->parent) :
+ (const char *)wdd->info->identity;
+ pr_err("%s: failed to register watchdog device (err = %d)\n",
+ dev_str, ret);
+ }
+
return ret;
}
EXPORT_SYMBOL_GPL(watchdog_register_device);
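
With the hunk above, watchdog_register_device() reports failures centrally, naming the device by its parent or, failing that, by the identity string; that is what lets this series delete the per-driver dev_err() calls on the registration path. A self-contained sketch of the name fallback, using simplified stand-ins for struct device and dev_name():

    #include <stdio.h>

    /* trimmed models; the real definitions live in <linux/watchdog.h> */
    struct watchdog_info { char identity[32]; };
    struct device { const char *name; };
    struct watchdog_device {
        struct device *parent;
        const struct watchdog_info *info;
    };

    static const char *wdd_dev_str(const struct watchdog_device *wdd)
    {
        /* prefer the parent device name, else the identity string */
        return wdd->parent ? wdd->parent->name
                           : (const char *)wdd->info->identity;
    }

    int main(void)
    {
        static const struct watchdog_info info = { .identity = "softdog" };
        struct watchdog_device wdd = { .parent = NULL, .info = &info };

        printf("%s: failed to register watchdog device (err = %d)\n",
               wdd_dev_str(&wdd), -16 /* e.g. -EBUSY */);
        return 0;
    }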
diff --git a/drivers/watchdog/watchdog_core.h b/drivers/watchdog/watchdog_core.h
index 86ff962d1e15..a5062e8e0d13 100644
--- a/drivers/watchdog/watchdog_core.h
+++ b/drivers/watchdog/watchdog_core.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* watchdog_core.h
*
@@ -16,11 +17,6 @@
* Satyam Sharma <satyam@infradead.org>
* Randy Dunlap <randy.dunlap@oracle.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox, CymruNet Ltd., Wim Van Sebroeck nor Iguana vzw.
* admit liability nor provide warranty for any of this software.
* This material is provided "AS-IS" and at no charge.
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 252a7c7b6592..dbd2ad4c9294 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* watchdog_dev.c
*
@@ -20,11 +21,6 @@
* Satyam Sharma <satyam@infradead.org>
* Randy Dunlap <randy.dunlap@oracle.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox, CymruNet Ltd., Wim Van Sebroeck nor Iguana vzw.
* admit liability nor provide warranty for any of this software.
* This material is provided "AS-IS" and at no charge.
@@ -69,6 +65,7 @@ struct watchdog_core_data {
struct mutex lock;
ktime_t last_keepalive;
ktime_t last_hw_keepalive;
+ ktime_t open_deadline;
struct hrtimer timer;
struct kthread_work work;
unsigned long status; /* Internal status bits */
@@ -87,6 +84,19 @@ static struct kthread_worker *watchdog_kworker;
static bool handle_boot_enabled =
IS_ENABLED(CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED);
+static unsigned open_timeout = CONFIG_WATCHDOG_OPEN_TIMEOUT;
+
+static bool watchdog_past_open_deadline(struct watchdog_core_data *data)
+{
+ return ktime_after(ktime_get(), data->open_deadline);
+}
+
+static void watchdog_set_open_deadline(struct watchdog_core_data *data)
+{
+ data->open_deadline = open_timeout ?
+ ktime_get() + ktime_set(open_timeout, 0) : KTIME_MAX;
+}
+
static inline bool watchdog_need_worker(struct watchdog_device *wdd)
{
/* All variables in milli-seconds */
@@ -119,14 +129,15 @@ static ktime_t watchdog_next_keepalive(struct watchdog_device *wdd)
ktime_t virt_timeout;
unsigned int hw_heartbeat_ms;
- virt_timeout = ktime_add(wd_data->last_keepalive,
- ms_to_ktime(timeout_ms));
+ if (watchdog_active(wdd))
+ virt_timeout = ktime_add(wd_data->last_keepalive,
+ ms_to_ktime(timeout_ms));
+ else
+ virt_timeout = wd_data->open_deadline;
+
hw_heartbeat_ms = min_not_zero(timeout_ms, wdd->max_hw_heartbeat_ms);
keepalive_interval = ms_to_ktime(hw_heartbeat_ms / 2);
- if (!watchdog_active(wdd))
- return keepalive_interval;
-
/*
* To ensure that the watchdog times out wdd->timeout seconds
* after the most recent ping from userspace, the last
@@ -211,7 +222,13 @@ static bool watchdog_worker_should_ping(struct watchdog_core_data *wd_data)
{
struct watchdog_device *wdd = wd_data->wdd;
- return wdd && (watchdog_active(wdd) || watchdog_hw_running(wdd));
+ if (!wdd)
+ return false;
+
+ if (watchdog_active(wdd))
+ return true;
+
+ return watchdog_hw_running(wdd) && !watchdog_past_open_deadline(wd_data);
}
static void watchdog_ping_work(struct kthread_work *work)
@@ -824,6 +841,15 @@ static int watchdog_open(struct inode *inode, struct file *file)
if (!hw_running)
kref_get(&wd_data->kref);
+ /*
+ * open_timeout only applies for the first open from
+ * userspace. Set open_deadline to infinity so that the kernel
+ * will take care of an always-running hardware watchdog in
+ * case the device gets magic-closed or WDIOS_DISABLECARD is
+ * applied.
+ */
+ wd_data->open_deadline = KTIME_MAX;
+
/* dev/watchdog is a virtual (and thus non-seekable) filesystem */
return stream_open(inode, file);
@@ -983,6 +1009,7 @@ static int watchdog_cdev_register(struct watchdog_device *wdd, dev_t devno)
/* Record time of most recent heartbeat as 'just before now'. */
wd_data->last_hw_keepalive = ktime_sub(ktime_get(), 1);
+ watchdog_set_open_deadline(wd_data);
/*
* If the watchdog is running, prevent its driver from being unloaded,
@@ -1181,3 +1208,8 @@ module_param(handle_boot_enabled, bool, 0444);
MODULE_PARM_DESC(handle_boot_enabled,
"Watchdog core auto-updates boot enabled watchdogs before userspace takes over (default="
__MODULE_STRING(IS_ENABLED(CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED)) ")");
+
+module_param(open_timeout, uint, 0644);
+MODULE_PARM_DESC(open_timeout,
+ "Maximum time (in seconds, 0 means infinity) for userspace to take over a running watchdog (default="
+ __MODULE_STRING(CONFIG_WATCHDOG_OPEN_TIMEOUT) ")");
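
The open_timeout machinery above gives userspace a bounded window to take over a watchdog that the kernel has been pinging on its behalf: the deadline is armed when the char device is registered and disarmed (set to KTIME_MAX) on the first open, after which magic-close handling takes over. A compilable sketch of the deadline semantics, substituting CLOCK_MONOTONIC for ktime_get() and an arbitrary 5-second default for CONFIG_WATCHDOG_OPEN_TIMEOUT:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    #define KTIME_MAX INT64_MAX

    static unsigned int open_timeout = 5;  /* seconds; 0 means no deadline */

    static int64_t now_ns(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
    }

    /* mirrors watchdog_set_open_deadline() */
    static int64_t set_open_deadline(void)
    {
        return open_timeout ?
               now_ns() + (int64_t)open_timeout * 1000000000 : KTIME_MAX;
    }

    /* mirrors watchdog_past_open_deadline() */
    static bool past_open_deadline(int64_t deadline)
    {
        return now_ns() > deadline;
    }

    int main(void)
    {
        int64_t deadline = set_open_deadline();

        printf("past deadline: %d\n", past_open_deadline(deadline));
        return 0;
    }

Once the deadline lapses, watchdog_worker_should_ping() stops feeding a hardware-running but never-opened device, so a forgotten watchdog finally resets the machine instead of being kept alive forever.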
diff --git a/drivers/watchdog/wd501p.h b/drivers/watchdog/wd501p.h
index 0e3a497d5626..43a4d88fd363 100644
--- a/drivers/watchdog/wd501p.h
+++ b/drivers/watchdog/wd501p.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-1.0+ */
/*
* Industrial Computer Source WDT500/501 driver
*
@@ -11,12 +12,7 @@
*
* http://www.cymru.net
*
- * This driver is provided under the GNU General Public License,
- * incorporated herein by reference. The driver is provided without
- * warranty or support.
- *
* Release 0.04.
- *
*/
diff --git a/drivers/watchdog/wdt.c b/drivers/watchdog/wdt.c
index 3d2f5ed60e88..0650100fad00 100644
--- a/drivers/watchdog/wdt.c
+++ b/drivers/watchdog/wdt.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Industrial Computer Source WDT501 driver
*
* (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/wdt_pci.c b/drivers/watchdog/wdt_pci.c
index ff3a41f47127..66303ab95685 100644
--- a/drivers/watchdog/wdt_pci.c
+++ b/drivers/watchdog/wdt_pci.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Industrial Computer Source PCI-WDT500/501 driver
*
* (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
diff --git a/drivers/watchdog/wm831x_wdt.c b/drivers/watchdog/wm831x_wdt.c
index 9b6565a3fab4..030ce240620d 100644
--- a/drivers/watchdog/wm831x_wdt.c
+++ b/drivers/watchdog/wm831x_wdt.c
@@ -267,14 +267,7 @@ static int wm831x_wdt_probe(struct platform_device *pdev)
}
}
- ret = devm_watchdog_register_device(dev, &driver_data->wdt);
- if (ret != 0) {
- dev_err(wm831x->dev, "watchdog_register_device() failed: %d\n",
- ret);
- return ret;
- }
-
- return 0;
+ return devm_watchdog_register_device(dev, &driver_data->wdt);
}
static struct platform_driver wm831x_wdt_driver = {
diff --git a/drivers/watchdog/xen_wdt.c b/drivers/watchdog/xen_wdt.c
index 2ba0a3c4523c..b343f421dc72 100644
--- a/drivers/watchdog/xen_wdt.c
+++ b/drivers/watchdog/xen_wdt.c
@@ -138,10 +138,8 @@ static int xen_wdt_probe(struct platform_device *pdev)
watchdog_stop_on_unregister(&xen_wdt_dev);
ret = devm_watchdog_register_device(dev, &xen_wdt_dev);
- if (ret) {
- dev_err(dev, "cannot register watchdog device (%d)\n", ret);
+ if (ret)
return ret;
- }
dev_info(dev, "initialized (timeout=%ds, nowayout=%d)\n",
xen_wdt_dev.timeout, nowayout);
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index d53f3493a6b9..cfbe46785a3b 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -402,7 +402,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir,
attrs);
- if (map == DMA_MAPPING_ERROR)
+ if (map == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
dev_addr = xen_phys_to_bus(map);
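
swiotlb_tbl_map_single() returns a phys_addr_t, so on configurations where phys_addr_t is narrower than dma_addr_t the unmodified comparison against the all-ones DMA_MAPPING_ERROR constant could never be true: the narrow value is zero-extended before the compare. A small demonstration of that promotion trap, with hypothetical 32-bit physical and 64-bit DMA address widths:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t dma_addr_t;   /* assumed 64-bit DMA addresses */
    typedef uint32_t phys_addr_t;  /* assumed 32-bit physical addresses */

    #define DMA_MAPPING_ERROR (~(dma_addr_t)0)

    int main(void)
    {
        /* the error value as it arrives truncated to phys_addr_t */
        phys_addr_t map = (phys_addr_t)DMA_MAPPING_ERROR;

        printf("no cast:   %d\n", map == DMA_MAPPING_ERROR);              /* 0 */
        printf("with cast: %d\n", map == (phys_addr_t)DMA_MAPPING_ERROR); /* 1 */
        return 0;
    }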
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 7f7d92d6b024..cf235f6eacf9 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -36,3 +36,15 @@ config CEPH_FS_POSIX_ACL
groups beyond the owner/group/world scheme.
If you don't know what Access Control Lists are, say N
+
+config CEPH_FS_SECURITY_LABEL
+ bool "CephFS Security Labels"
+ depends on CEPH_FS && SECURITY
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the Ceph filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 8a19c249036c..aa55f412a6e3 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -159,7 +159,7 @@ out:
}
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
- struct ceph_acls_info *info)
+ struct ceph_acl_sec_ctx *as_ctx)
{
struct posix_acl *acl, *default_acl;
size_t val_size1 = 0, val_size2 = 0;
@@ -234,9 +234,9 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
kfree(tmp_buf);
- info->acl = acl;
- info->default_acl = default_acl;
- info->pagelist = pagelist;
+ as_ctx->acl = acl;
+ as_ctx->default_acl = default_acl;
+ as_ctx->pagelist = pagelist;
return 0;
out_err:
@@ -248,18 +248,10 @@ out_err:
return err;
}
-void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info)
+void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx)
{
if (!inode)
return;
- ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl);
- ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl);
-}
-
-void ceph_release_acls_info(struct ceph_acls_info *info)
-{
- posix_acl_release(info->acl);
- posix_acl_release(info->default_acl);
- if (info->pagelist)
- ceph_pagelist_release(info->pagelist);
+ ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, as_ctx->acl);
+ ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, as_ctx->default_acl);
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a47c541f8006..e078cc55b989 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -10,6 +10,7 @@
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/signal.h>
+#include <linux/iversion.h>
#include "super.h"
#include "mds_client.h"
@@ -1576,6 +1577,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
/* Update time before taking page lock */
file_update_time(vma->vm_file);
+ inode_inc_iversion_raw(inode);
do {
lock_page(page);
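
Several ceph hunks in this series begin mapping the inode's raw i_version onto the MDS change_attr: it is zeroed when the inode is instantiated, bumped on local modifications such as page_mkwrite above, and ratcheted upward with whatever value the server reports. A toy, non-atomic model of the three raw-iversion helpers involved (the real ones in <linux/iversion.h> operate on an atomic64_t):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    struct inode { uint64_t i_version; };  /* simplified stand-in */

    static void inode_set_iversion_raw(struct inode *i, uint64_t v)
    {
        i->i_version = v;
    }

    static void inode_inc_iversion_raw(struct inode *i)
    {
        i->i_version++;
    }

    static void inode_set_max_iversion_raw(struct inode *i, uint64_t v)
    {
        if (v > i->i_version)  /* never move the counter backwards */
            i->i_version = v;
    }

    int main(void)
    {
        struct inode ino;

        inode_set_iversion_raw(&ino, 0);
        inode_inc_iversion_raw(&ino);         /* local write */
        inode_set_max_iversion_raw(&ino, 7);  /* newer value from the MDS */
        inode_set_max_iversion_raw(&ino, 3);  /* stale value is ignored */
        printf("change_attr = %" PRIu64 "\n", ino.i_version);  /* 7 */
        return 0;
    }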
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0176241eaea7..d98dcd976c80 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -8,6 +8,7 @@
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/writeback.h>
+#include <linux/iversion.h>
#include "super.h"
#include "mds_client.h"
@@ -1138,8 +1139,9 @@ struct cap_msg_args {
u64 ino, cid, follows;
u64 flush_tid, oldest_flush_tid, size, max_size;
u64 xattr_version;
+ u64 change_attr;
struct ceph_buffer *xattr_buf;
- struct timespec64 atime, mtime, ctime;
+ struct timespec64 atime, mtime, ctime, btime;
int op, caps, wanted, dirty;
u32 seq, issue_seq, mseq, time_warp_seq;
u32 flags;
@@ -1160,7 +1162,6 @@ static int send_cap_msg(struct cap_msg_args *arg)
struct ceph_msg *msg;
void *p;
size_t extra_len;
- struct timespec64 zerotime = {0};
struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
@@ -1245,15 +1246,10 @@ static int send_cap_msg(struct cap_msg_args *arg)
/* pool namespace (version 8) (mds always ignores this) */
ceph_encode_32(&p, 0);
- /*
- * btime and change_attr (version 9)
- *
- * We just zero these out for now, as the MDS ignores them unless
- * the requisite feature flags are set (which we don't do yet).
- */
- ceph_encode_timespec64(p, &zerotime);
+ /* btime and change_attr (version 9) */
+ ceph_encode_timespec64(p, &arg->btime);
p += sizeof(struct ceph_timespec);
- ceph_encode_64(&p, 0);
+ ceph_encode_64(&p, arg->change_attr);
/* Advisory flags (version 10) */
ceph_encode_32(&p, arg->flags);
@@ -1263,20 +1259,22 @@ static int send_cap_msg(struct cap_msg_args *arg)
}
/*
- * Queue cap releases when an inode is dropped from our cache. Since
- * inode is about to be destroyed, there is no need for i_ceph_lock.
+ * Queue cap releases when an inode is dropped from our cache.
*/
-void __ceph_remove_caps(struct inode *inode)
+void __ceph_remove_caps(struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
struct rb_node *p;
+ /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
+ * may call __ceph_caps_issued_mask() on a freeing inode. */
+ spin_lock(&ci->i_ceph_lock);
p = rb_first(&ci->i_caps);
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
p = rb_next(p);
__ceph_remove_cap(cap, true);
}
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -1297,7 +1295,7 @@ void __ceph_remove_caps(struct inode *inode)
* caller should hold snap_rwsem (read), s_mutex.
*/
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
- int op, bool sync, int used, int want, int retain,
+ int op, int flags, int used, int want, int retain,
int flushing, u64 flush_tid, u64 oldest_flush_tid)
__releases(cap->ci->i_ceph_lock)
{
@@ -1377,6 +1375,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
arg.mtime = inode->i_mtime;
arg.atime = inode->i_atime;
arg.ctime = inode->i_ctime;
+ arg.btime = ci->i_btime;
+ arg.change_attr = inode_peek_iversion_raw(inode);
arg.op = op;
arg.caps = cap->implemented;
@@ -1393,12 +1393,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
arg.mode = inode->i_mode;
arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
- if (list_empty(&ci->i_cap_snaps))
- arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
- else
- arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
- if (sync)
- arg.flags |= CEPH_CLIENT_CAPS_SYNC;
+ if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
+ !list_empty(&ci->i_cap_snaps)) {
+ struct ceph_cap_snap *capsnap;
+ list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->cap_flush.tid)
+ break;
+ if (capsnap->need_flush) {
+ flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
+ break;
+ }
+ }
+ }
+ arg.flags = flags;
spin_unlock(&ci->i_ceph_lock);
@@ -1436,6 +1443,8 @@ static inline int __send_flush_snap(struct inode *inode,
arg.atime = capsnap->atime;
arg.mtime = capsnap->mtime;
arg.ctime = capsnap->ctime;
+ arg.btime = capsnap->btime;
+ arg.change_attr = capsnap->change_attr;
arg.op = CEPH_CAP_OP_FLUSHSNAP;
arg.caps = capsnap->issued;
@@ -1603,10 +1612,8 @@ retry:
}
// make sure flushsnap messages are sent in proper order.
- if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
__kick_flushing_caps(mdsc, session, ci, 0);
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
- }
__ceph_flush_snaps(ci, session);
out:
@@ -2048,10 +2055,8 @@ ack:
if (cap == ci->i_auth_cap &&
(ci->i_ceph_flags &
(CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
- if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
__kick_flushing_caps(mdsc, session, ci, 0);
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
- }
if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
__ceph_flush_snaps(ci, session);
@@ -2087,7 +2092,7 @@ ack:
sent++;
/* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
cap_used, want, retain, flushing,
flush_tid, oldest_flush_tid);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
@@ -2121,6 +2126,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
retry:
spin_lock(&ci->i_ceph_lock);
+retry_locked:
if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
spin_unlock(&ci->i_ceph_lock);
dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
@@ -2128,8 +2134,6 @@ retry:
}
if (ci->i_dirty_caps && ci->i_auth_cap) {
struct ceph_cap *cap = ci->i_auth_cap;
- int used = __ceph_caps_used(ci);
- int want = __ceph_caps_wanted(ci);
int delayed;
if (!session || session != cap->session) {
@@ -2145,13 +2149,25 @@ retry:
goto out;
}
+ if (ci->i_ceph_flags &
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
+ __kick_flushing_caps(mdsc, session, ci, 0);
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+ __ceph_flush_snaps(ci, session);
+ goto retry_locked;
+ }
+
flushing = __mark_caps_flushing(inode, session, true,
&flush_tid, &oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
- used, want, (cap->issued | cap->implemented),
- flushing, flush_tid, oldest_flush_tid);
+ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ CEPH_CLIENT_CAPS_SYNC,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ (cap->issued | cap->implemented),
+ flushing, flush_tid, oldest_flush_tid);
if (delayed) {
spin_lock(&ci->i_ceph_lock);
@@ -2320,6 +2336,16 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_flush *cf;
int ret;
u64 first_tid = 0;
+ u64 last_snap_flush = 0;
+
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+
+ list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
+ if (!cf->caps) {
+ last_snap_flush = cf->tid;
+ break;
+ }
+ }
list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
if (cf->tid < first_tid)
@@ -2338,10 +2364,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
dout("kick_flushing_caps %p cap %p tid %llu %s\n",
inode, cap, cf->tid, ceph_cap_string(cf->caps));
ci->i_ceph_flags |= CEPH_I_NODELAY;
+
ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- false, __ceph_caps_used(ci),
+ (cf->tid < last_snap_flush ?
+ CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
+ __ceph_caps_used(ci),
__ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
+ (cap->issued | cap->implemented),
cf->caps, cf->tid, oldest_flush_tid);
if (ret) {
pr_err("kick_flushing_caps: error sending "
@@ -2410,7 +2439,6 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
*/
if ((cap->issued & ci->i_flushing_caps) !=
ci->i_flushing_caps) {
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
/* encode_caps_cb() also will reset these sequence
* numbers. make sure sequence numbers in cap flush
* message match later reconnect message */
@@ -2450,7 +2478,6 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
continue;
}
if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
__kick_flushing_caps(mdsc, session, ci,
oldest_flush_tid);
}
@@ -2478,7 +2505,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
- ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
} else {
@@ -3040,8 +3066,10 @@ struct cap_extra_info {
bool dirstat_valid;
u64 nfiles;
u64 nsubdirs;
+ u64 change_attr;
/* currently issued */
int issued;
+ struct timespec64 btime;
};
/*
@@ -3123,11 +3151,14 @@ static void handle_cap_grant(struct inode *inode,
__check_cap_issue(ci, cap, newcaps);
+ inode_set_max_iversion_raw(inode, extra_info->change_attr);
+
if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
(extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(grant->mode);
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
+ ci->i_btime = extra_info->btime;
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
from_kuid(&init_user_ns, inode->i_uid),
from_kgid(&init_user_ns, inode->i_gid));
@@ -3154,6 +3185,7 @@ static void handle_cap_grant(struct inode *inode,
ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
ci->i_xattrs.version = version;
ceph_forget_all_cached_acls(inode);
+ ceph_security_invalidate_secctx(inode);
}
}
@@ -3848,17 +3880,19 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
}
- if (msg_version >= 11) {
+ if (msg_version >= 9) {
struct ceph_timespec *btime;
- u64 change_attr;
- u32 flags;
- /* version >= 9 */
if (p + sizeof(*btime) > end)
goto bad;
btime = p;
+ ceph_decode_timespec64(&extra_info.btime, btime);
p += sizeof(*btime);
- ceph_decode_64_safe(&p, end, change_attr, bad);
+ ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
+ }
+
+ if (msg_version >= 11) {
+ u32 flags;
/* version >= 10 */
ceph_decode_32_safe(&p, end, flags, bad);
/* version >= 11 */
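
The decode fix above matters because btime and change_attr joined the cap message at version 9, yet the old code only parsed them once msg_version reached 11, silently discarding the fields from v9 and v10 senders. A simplified sketch of the corrected versioned-tail decoding, assuming a little-endian host and eliding the bounds checks that the kernel's ceph_decode_*_safe() helpers perform:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* matches the wire layout of the kernel's struct ceph_timespec */
    struct ceph_timespec { uint32_t tv_sec, tv_nsec; } __attribute__((packed));

    static void decode_tail(const uint8_t *p, int msg_version)
    {
        struct ceph_timespec btime;
        uint64_t change_attr;
        uint32_t flags;

        if (msg_version >= 9) {   /* fields present since v9 */
            memcpy(&btime, p, sizeof(btime));
            p += sizeof(btime);
            memcpy(&change_attr, p, sizeof(change_attr));
            p += sizeof(change_attr);
            printf("btime=%u.%u change_attr=%llu\n",
                   (unsigned)btime.tv_sec, (unsigned)btime.tv_nsec,
                   (unsigned long long)change_attr);
        }
        if (msg_version >= 11) {  /* flags field from the later revisions */
            memcpy(&flags, p, sizeof(flags));
            printf("flags=%u\n", (unsigned)flags);
        }
    }

    int main(void)
    {
        uint8_t buf[20] = { 0 };

        buf[0] = 1;   /* btime.tv_sec = 1 */
        buf[8] = 42;  /* change_attr = 42 */
        decode_tail(buf, 9);
        return 0;
    }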
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 83cd41fa2b01..2eb88ed22993 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -52,7 +52,7 @@ static int mdsc_show(struct seq_file *s, void *p)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
- int pathlen;
+ int pathlen = 0;
u64 pathbase;
char *path;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0637149fb9f9..aab29f48c62d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -825,7 +825,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
- struct ceph_acls_info acls = {};
+ struct ceph_acl_sec_ctx as_ctx = {};
int err;
if (ceph_snap(dir) != CEPH_NOSNAP)
@@ -836,7 +836,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
goto out;
}
- err = ceph_pre_init_acls(dir, &mode, &acls);
+ err = ceph_pre_init_acls(dir, &mode, &as_ctx);
+ if (err < 0)
+ goto out;
+ err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out;
@@ -855,9 +858,9 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
req->r_args.mknod.rdev = cpu_to_le32(rdev);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- if (acls.pagelist) {
- req->r_pagelist = acls.pagelist;
- acls.pagelist = NULL;
+ if (as_ctx.pagelist) {
+ req->r_pagelist = as_ctx.pagelist;
+ as_ctx.pagelist = NULL;
}
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
@@ -865,10 +868,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
ceph_mdsc_put_request(req);
out:
if (!err)
- ceph_init_inode_acls(d_inode(dentry), &acls);
+ ceph_init_inode_acls(d_inode(dentry), &as_ctx);
else
d_drop(dentry);
- ceph_release_acls_info(&acls);
+ ceph_release_acl_sec_ctx(&as_ctx);
return err;
}
@@ -884,6 +887,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
+ struct ceph_acl_sec_ctx as_ctx = {};
int err;
if (ceph_snap(dir) != CEPH_NOSNAP)
@@ -894,6 +898,10 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
goto out;
}
+ err = ceph_security_init_secctx(dentry, S_IFLNK | 0777, &as_ctx);
+ if (err < 0)
+ goto out;
+
dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
@@ -919,6 +927,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
out:
if (err)
d_drop(dentry);
+ ceph_release_acl_sec_ctx(&as_ctx);
return err;
}
@@ -927,7 +936,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
- struct ceph_acls_info acls = {};
+ struct ceph_acl_sec_ctx as_ctx = {};
int err = -EROFS;
int op;
@@ -950,7 +959,10 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
mode |= S_IFDIR;
- err = ceph_pre_init_acls(dir, &mode, &acls);
+ err = ceph_pre_init_acls(dir, &mode, &as_ctx);
+ if (err < 0)
+ goto out;
+ err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out;
@@ -967,9 +979,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
req->r_args.mkdir.mode = cpu_to_le32(mode);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- if (acls.pagelist) {
- req->r_pagelist = acls.pagelist;
- acls.pagelist = NULL;
+ if (as_ctx.pagelist) {
+ req->r_pagelist = as_ctx.pagelist;
+ as_ctx.pagelist = NULL;
}
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err &&
@@ -979,10 +991,10 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
ceph_mdsc_put_request(req);
out:
if (!err)
- ceph_init_inode_acls(d_inode(dentry), &acls);
+ ceph_init_inode_acls(d_inode(dentry), &as_ctx);
else
d_drop(dentry);
- ceph_release_acls_info(&acls);
+ ceph_release_acl_sec_ctx(&as_ctx);
return err;
}
@@ -1433,8 +1445,7 @@ static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
return false;
}
-static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
- struct inode *dir)
+static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
{
struct ceph_dentry_info *di;
struct ceph_mds_session *session = NULL;
@@ -1466,7 +1477,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
spin_unlock(&dentry->d_lock);
if (session) {
- ceph_mdsc_lease_send_msg(session, dir, dentry,
+ ceph_mdsc_lease_send_msg(session, dentry,
CEPH_MDS_LEASE_RENEW, seq);
ceph_put_mds_session(session);
}
@@ -1512,18 +1523,26 @@ static int __dir_lease_try_check(const struct dentry *dentry)
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
{
struct ceph_inode_info *ci = ceph_inode(dir);
- struct ceph_dentry_info *di = ceph_dentry(dentry);
- int valid = 0;
+ int valid;
+ int shared_gen;
spin_lock(&ci->i_ceph_lock);
- if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen)
- valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+ valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+ shared_gen = atomic_read(&ci->i_shared_gen);
spin_unlock(&ci->i_ceph_lock);
- if (valid)
- __ceph_dentry_dir_lease_touch(di);
- dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
- dir, (unsigned)atomic_read(&ci->i_shared_gen),
- dentry, (unsigned)di->lease_shared_gen, valid);
+ if (valid) {
+ struct ceph_dentry_info *di;
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (dir == d_inode(dentry->d_parent) &&
+ di && di->lease_shared_gen == shared_gen)
+ __ceph_dentry_dir_lease_touch(di);
+ else
+ valid = 0;
+ spin_unlock(&dentry->d_lock);
+ }
+ dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n",
+ dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid);
return valid;
}
@@ -1558,7 +1577,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
valid = 1;
} else {
- valid = dentry_lease_is_valid(dentry, flags, dir);
+ valid = dentry_lease_is_valid(dentry, flags);
if (valid == -ECHILD)
return valid;
if (valid || dir_lease_is_valid(dir, dentry)) {
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index d3ef7ee429ec..15ff1b09cfa2 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -368,7 +368,7 @@ static struct dentry *ceph_get_parent(struct dentry *child)
}
out:
dout("get_parent %p ino %llx.%llx err=%ld\n",
- child, ceph_vinop(inode), (IS_ERR(dn) ? PTR_ERR(dn) : 0));
+ child, ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn));
return dn;
}
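
PTR_ERR_OR_ZERO() as used above folds the IS_ERR() test and the errno extraction into a single expression, which keeps the dout() argument list free of open-coded ternaries. A minimal sketch with simplified models of the error-pointer macros (the real ones in <linux/err.h> carry unlikely() annotations and stricter typing):

    #include <stdio.h>

    #define MAX_ERRNO          4095
    #define IS_ERR(p)          ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)
    #define PTR_ERR(p)         ((long)(p))
    #define PTR_ERR_OR_ZERO(p) (IS_ERR(p) ? PTR_ERR(p) : 0L)

    int main(void)
    {
        int obj;
        void *ok = &obj;          /* valid pointer */
        void *err = (void *)-2L;  /* -ENOENT encoded as a pointer */

        printf("%ld %ld\n", PTR_ERR_OR_ZERO(ok), PTR_ERR_OR_ZERO(err));
        return 0;  /* prints: 0 -2 */
    }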
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c5517ffeb11c..685a03cc4b77 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -10,6 +10,7 @@
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/falloc.h>
+#include <linux/iversion.h>
#include "super.h"
#include "mds_client.h"
@@ -437,7 +438,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct dentry *dn;
- struct ceph_acls_info acls = {};
+ struct ceph_acl_sec_ctx as_ctx = {};
int mask;
int err;
@@ -451,25 +452,28 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (flags & O_CREAT) {
if (ceph_quota_is_max_files_exceeded(dir))
return -EDQUOT;
- err = ceph_pre_init_acls(dir, &mode, &acls);
+ err = ceph_pre_init_acls(dir, &mode, &as_ctx);
if (err < 0)
return err;
+ err = ceph_security_init_secctx(dentry, mode, &as_ctx);
+ if (err < 0)
+ goto out_ctx;
}
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out_acl;
+ goto out_ctx;
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
if (flags & O_CREAT) {
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- if (acls.pagelist) {
- req->r_pagelist = acls.pagelist;
- acls.pagelist = NULL;
+ if (as_ctx.pagelist) {
+ req->r_pagelist = as_ctx.pagelist;
+ as_ctx.pagelist = NULL;
}
}
@@ -507,7 +511,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
- ceph_init_inode_acls(d_inode(dentry), &acls);
+ ceph_init_inode_acls(d_inode(dentry), &as_ctx);
file->f_mode |= FMODE_CREATED;
}
err = finish_open(file, dentry, ceph_open);
@@ -516,8 +520,8 @@ out_req:
if (!req->r_err && req->r_target_inode)
ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
-out_acl:
- ceph_release_acls_info(&acls);
+out_ctx:
+ ceph_release_acl_sec_ctx(&as_ctx);
dout("atomic_open result=%d\n", err);
return err;
}
@@ -1007,7 +1011,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
* may block.
*/
truncate_inode_pages_range(inode->i_mapping, pos,
- (pos+len) | (PAGE_SIZE - 1));
+ PAGE_ALIGN(pos + len) - 1);
req->r_mtime = mtime;
}
@@ -1022,7 +1026,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
req->r_callback = ceph_aio_complete_req;
req->r_inode = inode;
req->r_priv = aio_req;
- list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
+ list_add_tail(&req->r_private_item, &aio_req->osd_reqs);
pos += len;
continue;
@@ -1082,8 +1086,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
while (!list_empty(&osd_reqs)) {
req = list_first_entry(&osd_reqs,
struct ceph_osd_request,
- r_unsafe_item);
- list_del_init(&req->r_unsafe_item);
+ r_private_item);
+ list_del_init(&req->r_private_item);
if (ret >= 0)
ret = ceph_osdc_start_request(req->r_osdc,
req, false);
@@ -1432,6 +1436,8 @@ retry_snap:
if (err)
goto out;
+ inode_inc_iversion_raw(inode);
+
if (ci->i_inline_version != CEPH_INLINE_NONE) {
err = ceph_uninline_data(file, NULL);
if (err < 0)
@@ -2063,6 +2069,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
do_final_copy = true;
file_update_time(dst_file);
+ inode_inc_iversion_raw(dst_inode);
+
if (endoff > size) {
int caps_flags = 0;
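
The truncate fix above swaps (pos + len) | (PAGE_SIZE - 1) for PAGE_ALIGN(pos + len) - 1: when pos + len already sits on a page boundary, the OR form reaches one full page past the written range and can invalidate cached data that was never touched. A quick demonstration with a hypothetical 4 KiB page:

    #include <stdio.h>

    #define PAGE_SIZE     4096UL
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long end = 8192;  /* pos + len, already page aligned */

        printf("old: %lu\n", end | (PAGE_SIZE - 1));  /* 12287: one page too far */
        printf("new: %lu\n", PAGE_ALIGN(end) - 1);    /* 8191: last byte written */
        return 0;
    }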
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 761451f36e2d..791f84a13bb8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -13,6 +13,7 @@
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/sort.h>
+#include <linux/iversion.h>
#include "super.h"
#include "mds_client.h"
@@ -42,6 +43,7 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
{
ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+ inode_set_iversion_raw(inode, 0);
return 0;
}
@@ -509,6 +511,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
+ memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
ceph_fscache_inode_init(ci);
@@ -523,17 +526,20 @@ void ceph_free_inode(struct inode *inode)
kmem_cache_free(ceph_inode_cachep, ci);
}
-void ceph_destroy_inode(struct inode *inode)
+void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_frag *frag;
struct rb_node *n;
- dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
ceph_fscache_unregister_inode_cookie(ci);
- __ceph_remove_caps(inode);
+ __ceph_remove_caps(ci);
if (__ceph_has_any_quota(ci))
ceph_adjust_quota_realms_count(inode, false);
@@ -578,16 +584,6 @@ void ceph_destroy_inode(struct inode *inode)
ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
}
-int ceph_drop_inode(struct inode *inode)
-{
- /*
- * Positive dentry and corresponding inode are always accompanied
- * in MDS reply. So no need to keep inode in the cache after
- * dropping all its aliases.
- */
- return 1;
-}
-
static inline blkcnt_t calc_inode_blocks(u64 size)
{
return (size + (1<<9) - 1) >> 9;
@@ -795,6 +791,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
le64_to_cpu(info->version) > (ci->i_version & ~1)))
new_version = true;
+ /* Update change_attribute */
+ inode_set_max_iversion_raw(inode, iinfo->change_attr);
+
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
new_issued = ~issued & info_caps;
@@ -813,6 +812,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
from_kuid(&init_user_ns, inode->i_uid),
from_kgid(&init_user_ns, inode->i_gid));
+ ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
+ ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
}
if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
@@ -887,6 +888,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
iinfo->xattr_data, iinfo->xattr_len);
ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
ceph_forget_all_cached_acls(inode);
+ ceph_security_invalidate_secctx(inode);
xattr_blob = NULL;
}
@@ -1027,59 +1029,38 @@ out:
}
/*
- * caller should hold session s_mutex.
+ * caller should hold session s_mutex and dentry->d_lock.
*/
-static void update_dentry_lease(struct dentry *dentry,
- struct ceph_mds_reply_lease *lease,
- struct ceph_mds_session *session,
- unsigned long from_time,
- struct ceph_vino *tgt_vino,
- struct ceph_vino *dir_vino)
+static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
+ struct ceph_mds_reply_lease *lease,
+ struct ceph_mds_session *session,
+ unsigned long from_time,
+ struct ceph_mds_session **old_lease_session)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
- struct inode *dir;
- struct ceph_mds_session *old_lease_session = NULL;
- /*
- * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
- * we expect a negative dentry.
- */
- if (!tgt_vino && d_really_is_positive(dentry))
- return;
-
- if (tgt_vino && (d_really_is_negative(dentry) ||
- !ceph_ino_compare(d_inode(dentry), tgt_vino)))
- return;
-
- spin_lock(&dentry->d_lock);
dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
dentry, duration, ttl);
- dir = d_inode(dentry->d_parent);
-
- /* make sure parent matches dir_vino */
- if (!ceph_ino_compare(dir, dir_vino))
- goto out_unlock;
-
/* only track leases on regular dentries */
if (ceph_snap(dir) != CEPH_NOSNAP)
- goto out_unlock;
+ return;
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
if (duration == 0) {
__ceph_dentry_dir_lease_touch(di);
- goto out_unlock;
+ return;
}
if (di->lease_gen == session->s_cap_gen &&
time_before(ttl, di->time))
- goto out_unlock; /* we already have a newer lease. */
+ return; /* we already have a newer lease. */
if (di->lease_session && di->lease_session != session) {
- old_lease_session = di->lease_session;
+ *old_lease_session = di->lease_session;
di->lease_session = NULL;
}
@@ -1092,6 +1073,62 @@ static void update_dentry_lease(struct dentry *dentry,
di->time = ttl;
__ceph_dentry_lease_touch(di);
+}
+
+static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
+ struct ceph_mds_reply_lease *lease,
+ struct ceph_mds_session *session,
+ unsigned long from_time)
+{
+ struct ceph_mds_session *old_lease_session = NULL;
+ spin_lock(&dentry->d_lock);
+ __update_dentry_lease(dir, dentry, lease, session, from_time,
+ &old_lease_session);
+ spin_unlock(&dentry->d_lock);
+ if (old_lease_session)
+ ceph_put_mds_session(old_lease_session);
+}
+
+/*
+ * update dentry lease without having parent inode locked
+ */
+static void update_dentry_lease_careful(struct dentry *dentry,
+ struct ceph_mds_reply_lease *lease,
+ struct ceph_mds_session *session,
+ unsigned long from_time,
+ char *dname, u32 dname_len,
+ struct ceph_vino *pdvino,
+ struct ceph_vino *ptvino)
+
+{
+ struct inode *dir;
+ struct ceph_mds_session *old_lease_session = NULL;
+
+ spin_lock(&dentry->d_lock);
+ /* make sure dentry's name matches target */
+ if (dentry->d_name.len != dname_len ||
+ memcmp(dentry->d_name.name, dname, dname_len))
+ goto out_unlock;
+
+ dir = d_inode(dentry->d_parent);
+ /* make sure parent matches dvino */
+ if (!ceph_ino_compare(dir, pdvino))
+ goto out_unlock;
+
+ /* make sure dentry's inode matches target. NULL ptvino means that
+ * we expect a negative dentry */
+ if (ptvino) {
+ if (d_really_is_negative(dentry))
+ goto out_unlock;
+ if (!ceph_ino_compare(d_inode(dentry), ptvino))
+ goto out_unlock;
+ } else {
+ if (d_really_is_positive(dentry))
+ goto out_unlock;
+ }
+
+ __update_dentry_lease(dir, dentry, lease, session,
+ from_time, &old_lease_session);
out_unlock:
spin_unlock(&dentry->d_lock);
if (old_lease_session)
@@ -1156,19 +1193,6 @@ static int splice_dentry(struct dentry **pdn, struct inode *in)
return 0;
}
-static int d_name_cmp(struct dentry *dentry, const char *name, size_t len)
-{
- int ret;
-
- /* take d_lock to ensure dentry->d_name stability */
- spin_lock(&dentry->d_lock);
- ret = dentry->d_name.len - len;
- if (!ret)
- ret = memcmp(dentry->d_name.name, name, len);
- spin_unlock(&dentry->d_lock);
- return ret;
-}
-
/*
* Incorporate results into the local cache. This is either just
* one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
@@ -1371,10 +1395,9 @@ retry_lookup:
} else if (have_lease) {
if (d_unhashed(dn))
d_add(dn, NULL);
- update_dentry_lease(dn, rinfo->dlease,
- session,
- req->r_request_started,
- NULL, &dvino);
+ update_dentry_lease(dir, dn,
+ rinfo->dlease, session,
+ req->r_request_started);
}
goto done;
}
@@ -1396,11 +1419,9 @@ retry_lookup:
}
if (have_lease) {
- tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
- update_dentry_lease(dn, rinfo->dlease, session,
- req->r_request_started,
- &tvino, &dvino);
+ update_dentry_lease(dir, dn,
+ rinfo->dlease, session,
+ req->r_request_started);
}
dout(" final dn %p\n", dn);
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
@@ -1418,27 +1439,20 @@ retry_lookup:
err = splice_dentry(&req->r_dentry, in);
if (err < 0)
goto done;
- } else if (rinfo->head->is_dentry &&
- !d_name_cmp(req->r_dentry, rinfo->dname, rinfo->dname_len)) {
+ } else if (rinfo->head->is_dentry && req->r_dentry) {
+ /* parent inode is not locked, be careful */
struct ceph_vino *ptvino = NULL;
-
- if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
- le32_to_cpu(rinfo->dlease->duration_ms)) {
- dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
- dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
-
- if (rinfo->head->is_target) {
- tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
- ptvino = &tvino;
- }
-
- update_dentry_lease(req->r_dentry, rinfo->dlease,
- session, req->r_request_started, ptvino,
- &dvino);
- } else {
- dout("%s: no dentry lease or dir cap\n", __func__);
+ dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+ if (rinfo->head->is_target) {
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+ ptvino = &tvino;
}
+ update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
+ session, req->r_request_started,
+ rinfo->dname, rinfo->dname_len,
+ &dvino, ptvino);
}
done:
dout("fill_trace done err=%d\n", err);
@@ -1600,7 +1614,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
/* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
- struct ceph_vino tvino, dvino;
+ struct ceph_vino tvino;
dname.name = rde->name;
dname.len = rde->name_len;
@@ -1701,9 +1715,9 @@ retry_lookup:
ceph_dentry(dn)->offset = rde->offset;
- dvino = ceph_vino(d_inode(parent));
- update_dentry_lease(dn, rde->lease, req->r_session,
- req->r_request_started, &tvino, &dvino);
+ update_dentry_lease(d_inode(parent), dn,
+ rde->lease, req->r_session,
+ req->r_request_started);
if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn,
@@ -2282,7 +2296,7 @@ static int statx_to_caps(u32 want)
{
int mask = 0;
- if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME))
+ if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME))
mask |= CEPH_CAP_AUTH_SHARED;
if (want & (STATX_NLINK|STATX_CTIME))
@@ -2307,6 +2321,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
{
struct inode *inode = d_inode(path->dentry);
struct ceph_inode_info *ci = ceph_inode(inode);
+ u32 valid_mask = STATX_BASIC_STATS;
int err = 0;
/* Skip the getattr altogether if we're asked not to sync */
@@ -2319,6 +2334,16 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
generic_fillattr(inode, stat);
stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+
+ /*
+ * btime on newly-allocated inodes is 0, so if this is still set to
+ * that, then assume that it's not valid.
+ */
+ if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) {
+ stat->btime = ci->i_btime;
+ valid_mask |= STATX_BTIME;
+ }
+
if (ceph_snap(inode) == CEPH_NOSNAP)
stat->dev = inode->i_sb->s_dev;
else
@@ -2342,7 +2367,6 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
stat->nlink = 1 + 1 + ci->i_subdirs;
}
- /* Mask off any higher bits (e.g. btime) until we have support */
- stat->result_mask = request_mask & STATX_BASIC_STATS;
+ stat->result_mask = request_mask & valid_mask;
return err;
}
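Aside (illustrative, not part of this patch): with ceph_getattr() now reporting STATX_BTIME, a userspace caller can request the birth time but must check stx_mask, since the bit is only set when the MDS supplied a nonzero btime. A hypothetical checker, assuming a glibc that exposes the statx() wrapper:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc != 2)
		return 1;
	if (statx(AT_FDCWD, argv[1], 0, STATX_BTIME, &stx) != 0) {
		perror("statx");
		return 1;
	}
	if (stx.stx_mask & STATX_BTIME)	/* only set when btime is valid */
		printf("btime: %lld.%09u\n",
		       (long long)stx.stx_btime.tv_sec, stx.stx_btime.tv_nsec);
	else
		puts("btime not reported");
	return 0;
}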
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c8a9b89b922d..920e9f048bd8 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -150,14 +150,13 @@ static int parse_reply_info_in(void **p, void *end,
info->pool_ns_data = *p;
*p += info->pool_ns_len;
}
- /* btime, change_attr */
- {
- struct ceph_timespec btime;
- u64 change_attr;
- ceph_decode_need(p, end, sizeof(btime), bad);
- ceph_decode_copy(p, &btime, sizeof(btime));
- ceph_decode_64_safe(p, end, change_attr, bad);
- }
+
+ /* btime */
+ ceph_decode_need(p, end, sizeof(info->btime), bad);
+ ceph_decode_copy(p, &info->btime, sizeof(info->btime));
+
+ /* change attribute */
+ ceph_decode_64_safe(p, end, info->change_attr, bad);
/* dir pin */
if (struct_v >= 2) {
@@ -166,6 +165,15 @@ static int parse_reply_info_in(void **p, void *end,
info->dir_pin = -ENODATA;
}
+ /* snapshot birth time, remains zero for v<=2 */
+ if (struct_v >= 3) {
+ ceph_decode_need(p, end, sizeof(info->snap_btime), bad);
+ ceph_decode_copy(p, &info->snap_btime,
+ sizeof(info->snap_btime));
+ } else {
+ memset(&info->snap_btime, 0, sizeof(info->snap_btime));
+ }
+
*p = end;
} else {
if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
@@ -197,7 +205,14 @@ static int parse_reply_info_in(void **p, void *end,
}
}
+ if (features & CEPH_FEATURE_FS_BTIME) {
+ ceph_decode_need(p, end, sizeof(info->btime), bad);
+ ceph_decode_copy(p, &info->btime, sizeof(info->btime));
+ ceph_decode_64_safe(p, end, info->change_attr, bad);
+ }
+
info->dir_pin = -ENODATA;
+ /* info->snap_btime remains zero */
}
return 0;
bad:
@@ -717,6 +732,7 @@ void ceph_mdsc_release_request(struct kref *kref)
ceph_pagelist_release(req->r_pagelist);
put_request_session(req);
ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
+ WARN_ON_ONCE(!list_empty(&req->r_wait));
kfree(req);
}
@@ -903,7 +919,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
struct inode *dir;
rcu_read_lock();
- parent = req->r_dentry->d_parent;
+ parent = READ_ONCE(req->r_dentry->d_parent);
dir = req->r_parent ? : d_inode_rcu(parent);
if (!dir || dir->i_sb != mdsc->fsc->sb) {
@@ -2135,7 +2151,7 @@ retry:
memcpy(path + pos, temp->d_name.name, temp->d_name.len);
}
spin_unlock(&temp->d_lock);
- temp = temp->d_parent;
+ temp = READ_ONCE(temp->d_parent);
/* Are we at the root? */
if (IS_ROOT(temp))
@@ -3727,42 +3743,35 @@ static void check_new_map(struct ceph_mds_client *mdsc,
ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
ceph_session_state_name(s->s_state));
- if (i >= newmap->m_num_mds ||
- memcmp(ceph_mdsmap_get_addr(oldmap, i),
- ceph_mdsmap_get_addr(newmap, i),
- sizeof(struct ceph_entity_addr))) {
- if (s->s_state == CEPH_MDS_SESSION_OPENING) {
- /* the session never opened, just close it
- * out now */
- get_session(s);
- __unregister_session(mdsc, s);
- __wake_requests(mdsc, &s->s_waiting);
- ceph_put_mds_session(s);
- } else if (i >= newmap->m_num_mds) {
- /* force close session for stopped mds */
- get_session(s);
- __unregister_session(mdsc, s);
- __wake_requests(mdsc, &s->s_waiting);
- kick_requests(mdsc, i);
- mutex_unlock(&mdsc->mutex);
+ if (i >= newmap->m_num_mds) {
+ /* force close session for stopped mds */
+ get_session(s);
+ __unregister_session(mdsc, s);
+ __wake_requests(mdsc, &s->s_waiting);
+ mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- cleanup_session_requests(mdsc, s);
- remove_session_caps(s);
- mutex_unlock(&s->s_mutex);
+ mutex_lock(&s->s_mutex);
+ cleanup_session_requests(mdsc, s);
+ remove_session_caps(s);
+ mutex_unlock(&s->s_mutex);
- ceph_put_mds_session(s);
+ ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- } else {
- /* just close it */
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_lock(&mdsc->mutex);
- ceph_con_close(&s->s_con);
- mutex_unlock(&s->s_mutex);
- s->s_state = CEPH_MDS_SESSION_RESTARTING;
- }
+ mutex_lock(&mdsc->mutex);
+ kick_requests(mdsc, i);
+ continue;
+ }
+
+ if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
+ ceph_mdsmap_get_addr(newmap, i),
+ sizeof(struct ceph_entity_addr))) {
+ /* just close it */
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_lock(&mdsc->mutex);
+ ceph_con_close(&s->s_con);
+ mutex_unlock(&s->s_mutex);
+ s->s_state = CEPH_MDS_SESSION_RESTARTING;
} else if (oldstate == newstate) {
continue; /* nothing new with this mds */
}
@@ -3931,31 +3940,33 @@ bad:
}
void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
- struct inode *inode,
struct dentry *dentry, char action,
u32 seq)
{
struct ceph_msg *msg;
struct ceph_mds_lease *lease;
- int len = sizeof(*lease) + sizeof(u32);
- int dnamelen = 0;
+ struct inode *dir;
+ int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;
- dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
- inode, dentry, ceph_lease_op_name(action), session->s_mds);
- dnamelen = dentry->d_name.len;
- len += dnamelen;
+ dout("lease_send_msg identry %p %s to mds%d\n",
+ dentry, ceph_lease_op_name(action), session->s_mds);
msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
if (!msg)
return;
lease = msg->front.iov_base;
lease->action = action;
- lease->ino = cpu_to_le64(ceph_vino(inode).ino);
- lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
lease->seq = cpu_to_le32(seq);
- put_unaligned_le32(dnamelen, lease + 1);
- memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
+ spin_lock(&dentry->d_lock);
+ dir = d_inode(dentry->d_parent);
+ lease->ino = cpu_to_le64(ceph_ino(dir));
+ lease->first = lease->last = cpu_to_le64(ceph_snap(dir));
+
+ put_unaligned_le32(dentry->d_name.len, lease + 1);
+ memcpy((void *)(lease + 1) + 4,
+ dentry->d_name.name, dentry->d_name.len);
+ spin_unlock(&dentry->d_lock);
/*
* if this is a preemptive lease RELEASE, no need to
* flush request stream, since the actual request will
@@ -4157,6 +4168,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
while ((req = __get_oldest_req(mdsc))) {
dout("wait_requests timed out on tid %llu\n",
req->r_tid);
+ list_del_init(&req->r_wait);
__unregister_request(mdsc, req);
}
}
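Aside (illustrative, not part of this patch): the rewritten ceph_mdsc_lease_send_msg() above sizes the message for NAME_MAX up front because d_name can change until d_lock is held, then encodes the name as a raw trailer after the fixed struct. A sketch of that trailer layout:

#include <asm/unaligned.h>
#include <linux/ceph/ceph_fs.h>
#include <linux/string.h>

static void encode_lease_trailer(struct ceph_mds_lease *lease,
				 const char *name, u32 len)
{
	put_unaligned_le32(len, lease + 1);		/* le32 name length */
	memcpy((void *)(lease + 1) + 4, name, len);	/* raw, unterminated name */
}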
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index a83f28bc2387..f7c8603484fe 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -69,6 +69,9 @@ struct ceph_mds_reply_info_in {
u64 max_bytes;
u64 max_files;
s32 dir_pin;
+ struct ceph_timespec btime;
+ struct ceph_timespec snap_btime;
+ u64 change_attr;
};
struct ceph_mds_reply_dir_entry {
@@ -504,7 +507,6 @@ extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
- struct inode *inode,
struct dentry *dentry, char action,
u32 seq);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 701b4fb0fb5a..ce2d00da5096 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -107,7 +107,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
struct ceph_mdsmap *m;
const void *start = *p;
int i, j, n;
- int err = -EINVAL;
+ int err;
u8 mdsmap_v, mdsmap_cv;
u16 mdsmap_ev;
@@ -183,8 +183,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
inc = ceph_decode_32(p);
state = ceph_decode_32(p);
state_seq = ceph_decode_64(p);
- ceph_decode_copy(p, &addr, sizeof(addr));
- ceph_decode_addr(&addr);
+ err = ceph_decode_entity_addr(p, end, &addr);
+ if (err)
+ goto corrupt;
ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
*p += sizeof(u32);
ceph_decode_32_safe(p, end, namelen, bad);
@@ -357,7 +358,7 @@ bad_ext:
nomem:
err = -ENOMEM;
goto out_err;
-bad:
+corrupt:
pr_err("corrupt mdsmap\n");
print_hex_dump(KERN_DEBUG, "mdsmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
@@ -365,6 +366,9 @@ bad:
out_err:
ceph_mdsmap_destroy(m);
return ERR_PTR(err);
+bad:
+ err = -EINVAL;
+ goto corrupt;
}
void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
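Aside (illustrative, not part of this patch): ceph_decode_entity_addr() takes the end pointer, so a truncated map is reported as an error instead of being silently overread, which is why it replaces the bare ceph_decode_copy() above. A hypothetical helper showing the same bounds-checked idiom (the kernel's ceph_decode_*_safe macros do this with a goto label instead):

#include <linux/ceph/decode.h>
#include <linux/errno.h>

static int decode_u32_checked(void **p, void *end, u32 *out)
{
	if (*p + sizeof(u32) > end)	/* refuse to read past the buffer */
		return -EINVAL;
	*out = ceph_decode_32(p);	/* copies the value and advances the cursor */
	return 0;
}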
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index d629fc857450..de56dee60540 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -135,7 +135,7 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
return NULL;
mutex_lock(&qri->mutex);
- if (qri->inode) {
+ if (qri->inode && ceph_is_any_caps(qri->inode)) {
/* A request has already returned the inode */
mutex_unlock(&qri->mutex);
return qri->inode;
@@ -146,7 +146,18 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
mutex_unlock(&qri->mutex);
return NULL;
}
- in = ceph_lookup_inode(sb, realm->ino);
+ if (qri->inode) {
+ /* get caps */
+ int ret = __ceph_do_getattr(qri->inode, NULL,
+ CEPH_STAT_CAP_INODE, true);
+ if (ret >= 0)
+ in = qri->inode;
+ else
+ in = ERR_PTR(ret);
+ } else {
+ in = ceph_lookup_inode(sb, realm->ino);
+ }
+
if (IS_ERR(in)) {
pr_warn("Can't lookup inode %llx (err: %ld)\n",
realm->ino, PTR_ERR(in));
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 72c6c022f02b..4c6494eb02b5 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -3,6 +3,7 @@
#include <linux/sort.h>
#include <linux/slab.h>
+#include <linux/iversion.h>
#include "super.h"
#include "mds_client.h"
#include <linux/ceph/decode.h>
@@ -606,6 +607,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->mtime = inode->i_mtime;
capsnap->atime = inode->i_atime;
capsnap->ctime = inode->i_ctime;
+ capsnap->btime = ci->i_btime;
+ capsnap->change_attr = inode_peek_iversion_raw(inode);
capsnap->time_warp_seq = ci->i_time_warp_seq;
capsnap->truncate_size = ci->i_truncate_size;
capsnap->truncate_seq = ci->i_truncate_seq;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index ed1b65a6c2c3..ab4868c7308e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -840,10 +840,10 @@ static int ceph_remount(struct super_block *sb, int *flags, char *data)
static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
- .destroy_inode = ceph_destroy_inode,
.free_inode = ceph_free_inode,
.write_inode = ceph_write_inode,
- .drop_inode = ceph_drop_inode,
+ .drop_inode = generic_delete_inode,
+ .evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.remount_fs = ceph_remount,
@@ -978,7 +978,7 @@ static int ceph_set_super(struct super_block *s, void *data)
s->s_d_op = &ceph_dentry_ops;
s->s_export_op = &ceph_export_ops;
- s->s_time_gran = 1000; /* 1000 ns == 1 us */
+ s->s_time_gran = 1;
ret = set_anon_super(s, NULL); /* what is that second arg for? */
if (ret != 0)
@@ -1159,17 +1159,15 @@ static int __init init_ceph(void)
goto out;
ceph_flock_init();
- ceph_xattr_init();
ret = register_filesystem(&ceph_fs_type);
if (ret)
- goto out_xattr;
+ goto out_caches;
pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
return 0;
-out_xattr:
- ceph_xattr_exit();
+out_caches:
destroy_caches();
out:
return ret;
@@ -1179,7 +1177,6 @@ static void __exit exit_ceph(void)
{
dout("exit_ceph\n");
unregister_filesystem(&ceph_fs_type);
- ceph_xattr_exit();
destroy_caches();
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fbe6869a3f95..d2352fd95dbc 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -197,7 +197,8 @@ struct ceph_cap_snap {
u64 xattr_version;
u64 size;
- struct timespec64 mtime, atime, ctime;
+ u64 change_attr;
+ struct timespec64 mtime, atime, ctime, btime;
u64 time_warp_seq;
u64 truncate_size;
u32 truncate_seq;
@@ -384,6 +385,8 @@ struct ceph_inode_info {
int i_snap_realm_counter; /* snap realm (if caps) */
struct list_head i_snap_realm_item;
struct list_head i_snap_flush_item;
+ struct timespec64 i_btime;
+ struct timespec64 i_snap_btime;
struct work_struct i_work;
unsigned long i_work_mask;
@@ -544,7 +547,12 @@ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
long long ordered_count)
{
- smp_mb__before_atomic();
+ /*
+ * Makes sure operations that set up the readdir cache (update page
+ * cache and i_size) are strongly ordered w.r.t. the following
+ * atomic64_set() operations.
+ */
+ smp_mb();
atomic64_set(&ci->i_complete_seq[0], release_count);
atomic64_set(&ci->i_complete_seq[1], ordered_count);
}
@@ -876,9 +884,8 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
-extern void ceph_destroy_inode(struct inode *inode);
+extern void ceph_evict_inode(struct inode *inode);
extern void ceph_free_inode(struct inode *inode);
-extern int ceph_drop_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino);
@@ -921,10 +928,20 @@ ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
-extern void __init ceph_xattr_init(void);
-extern void ceph_xattr_exit(void);
extern const struct xattr_handler *ceph_xattr_handlers[];
+struct ceph_acl_sec_ctx {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ void *default_acl;
+ void *acl;
+#endif
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+ void *sec_ctx;
+ u32 sec_ctxlen;
+#endif
+ struct ceph_pagelist *pagelist;
+};
+
#ifdef CONFIG_SECURITY
extern bool ceph_security_xattr_deadlock(struct inode *in);
extern bool ceph_security_xattr_wanted(struct inode *in);
@@ -939,21 +956,32 @@ static inline bool ceph_security_xattr_wanted(struct inode *in)
}
#endif
-/* acl.c */
-struct ceph_acls_info {
- void *default_acl;
- void *acl;
- struct ceph_pagelist *pagelist;
-};
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
+ struct ceph_acl_sec_ctx *ctx);
+extern void ceph_security_invalidate_secctx(struct inode *inode);
+#else
+static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
+ struct ceph_acl_sec_ctx *ctx)
+{
+ return 0;
+}
+static inline void ceph_security_invalidate_secctx(struct inode *inode)
+{
+}
+#endif
+
+void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
+/* acl.c */
#ifdef CONFIG_CEPH_FS_POSIX_ACL
struct posix_acl *ceph_get_acl(struct inode *, int);
int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
- struct ceph_acls_info *info);
-void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info);
-void ceph_release_acls_info(struct ceph_acls_info *info);
+ struct ceph_acl_sec_ctx *as_ctx);
+void ceph_init_inode_acls(struct inode *inode,
+ struct ceph_acl_sec_ctx *as_ctx);
static inline void ceph_forget_all_cached_acls(struct inode *inode)
{
@@ -966,15 +994,12 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
#define ceph_set_acl NULL
static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
- struct ceph_acls_info *info)
+ struct ceph_acl_sec_ctx *as_ctx)
{
return 0;
}
static inline void ceph_init_inode_acls(struct inode *inode,
- struct ceph_acls_info *info)
-{
-}
-static inline void ceph_release_acls_info(struct ceph_acls_info *info)
+ struct ceph_acl_sec_ctx *as_ctx)
{
}
static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
@@ -1000,7 +1025,7 @@ extern void ceph_add_cap(struct inode *inode,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
-extern void __ceph_remove_caps(struct inode* inode);
+extern void __ceph_remove_caps(struct ceph_inode_info *ci);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
extern int ceph_is_any_caps(struct inode *inode);
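Aside (illustrative, not part of this patch): the smp_mb() swap in __ceph_dir_set_complete() is needed because atomic64_set() is a plain store, not a read-modify-write, so smp_mb__before_atomic() gives it no ordering guarantee. A hypothetical reader showing the pairing (the real consumer lives elsewhere in ceph):

#include <linux/atomic.h>

static bool cache_is_current(atomic64_t *seq, long long expected)
{
	bool ok = atomic64_read(seq) == expected;

	smp_rmb();	/* pairs with the writer's smp_mb(): order the
			 * counter check before consuming cached data */
	return ok;
}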
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0cc42c8879e9..37b458a9af3a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -8,6 +8,7 @@
#include <linux/ceph/decode.h>
#include <linux/xattr.h>
+#include <linux/security.h>
#include <linux/posix_acl_xattr.h>
#include <linux/slab.h>
@@ -17,26 +18,9 @@
static int __remove_xattr(struct ceph_inode_info *ci,
struct ceph_inode_xattr *xattr);
-static const struct xattr_handler ceph_other_xattr_handler;
-
-/*
- * List of handlers for synthetic system.* attributes. Other
- * attributes are handled directly.
- */
-const struct xattr_handler *ceph_xattr_handlers[] = {
-#ifdef CONFIG_CEPH_FS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
- &ceph_other_xattr_handler,
- NULL,
-};
-
static bool ceph_is_valid_xattr(const char *name)
{
return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
- !strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
@@ -48,8 +32,8 @@ static bool ceph_is_valid_xattr(const char *name)
struct ceph_vxattr {
char *name;
size_t name_size; /* strlen(name) + 1 (for '\0') */
- size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
- size_t size);
+ ssize_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
+ size_t size);
bool (*exists_cb)(struct ceph_inode_info *ci);
unsigned int flags;
};
@@ -68,8 +52,8 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
rcu_dereference_raw(fl->pool_ns) != NULL);
}
-static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
@@ -79,7 +63,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
const char *ns_field = " pool_namespace=";
char buf[128];
size_t len, total_len = 0;
- int ret;
+ ssize_t ret;
pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
@@ -96,18 +80,15 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
len = snprintf(buf, sizeof(buf),
"stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
- ci->i_layout.object_size, (unsigned long long)pool);
+ ci->i_layout.object_size, pool);
total_len = len;
}
if (pool_ns)
total_len += strlen(ns_field) + pool_ns->len;
- if (!size) {
- ret = total_len;
- } else if (total_len > size) {
- ret = -ERANGE;
- } else {
+ ret = total_len;
+ if (size >= total_len) {
memcpy(val, buf, len);
ret = len;
if (pool_name) {
@@ -128,28 +109,55 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
return ret;
}
-static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
- char *val, size_t size)
+/*
+ * The convention with strings in xattrs is that they should not be NULL
+ * terminated, since we're returning the length with them. snprintf always
+ * NULL terminates however, so call it on a temporary buffer and then memcpy
+ * the result into place.
+ */
+static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
{
- return snprintf(val, size, "%u", ci->i_layout.stripe_unit);
+ int ret;
+ va_list args;
+ char buf[96]; /* NB: reevaluate size if new vxattrs are added */
+
+ va_start(args, fmt);
+ ret = vsnprintf(buf, size ? sizeof(buf) : 0, fmt, args);
+ va_end(args);
+
+ /* Sanity check */
+ if (size && ret + 1 > sizeof(buf)) {
+ WARN_ONCE(true, "Returned length too big (%d)", ret);
+ return -E2BIG;
+ }
+
+ if (ret <= size)
+ memcpy(val, buf, ret);
+ return ret;
}
-static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+static ssize_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%u", ci->i_layout.stripe_count);
+ return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_unit);
+}
+
+static ssize_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_count);
}
-static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
- char *val, size_t size)
+static ssize_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+ char *val, size_t size)
{
- return snprintf(val, size, "%u", ci->i_layout.object_size);
+ return ceph_fmt_xattr(val, size, "%u", ci->i_layout.object_size);
}
-static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
- char *val, size_t size)
+static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+ char *val, size_t size)
{
- int ret;
+ ssize_t ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ci->i_layout.pool_id;
@@ -157,21 +165,27 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
- if (pool_name)
- ret = snprintf(val, size, "%s", pool_name);
- else
- ret = snprintf(val, size, "%lld", (unsigned long long)pool);
+ if (pool_name) {
+ ret = strlen(pool_name);
+ if (ret <= size)
+ memcpy(val, pool_name, ret);
+ } else {
+ ret = ceph_fmt_xattr(val, size, "%lld", pool);
+ }
up_read(&osdc->lock);
return ret;
}
-static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
- char *val, size_t size)
+static ssize_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
+ char *val, size_t size)
{
- int ret = 0;
+ ssize_t ret = 0;
struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
+
if (ns) {
- ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str);
+ ret = ns->len;
+ if (ret <= size)
+ memcpy(val, ns->str, ret);
ceph_put_string(ns);
}
return ret;
@@ -179,53 +193,54 @@ static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
/* directories */
-static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_files + ci->i_subdirs);
}
-static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld", ci->i_files);
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_files);
}
-static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld", ci->i_subdirs);
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_subdirs);
}
-static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
+ return ceph_fmt_xattr(val, size, "%lld",
+ ci->i_rfiles + ci->i_rsubdirs);
}
-static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld", ci->i_rfiles);
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_rfiles);
}
-static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld", ci->i_rsubdirs);
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs);
}
-static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld", ci->i_rbytes);
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_rbytes);
}
-static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld.09%ld", ci->i_rctime.tv_sec,
- ci->i_rctime.tv_nsec);
+ return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_rctime.tv_sec,
+ ci->i_rctime.tv_nsec);
}
/* dir pin */
@@ -234,10 +249,10 @@ static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci)
return ci->i_dir_pin != -ENODATA;
}
-static size_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%d", (int)ci->i_dir_pin);
+ return ceph_fmt_xattr(val, size, "%d", (int)ci->i_dir_pin);
}
/* quotas */
@@ -254,23 +269,36 @@ static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
return ret;
}
-static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return ceph_fmt_xattr(val, size, "max_bytes=%llu max_files=%llu",
+ ci->i_max_bytes, ci->i_max_files);
+}
+
+static ssize_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci,
+ char *val, size_t size)
{
- return snprintf(val, size, "max_bytes=%llu max_files=%llu",
- ci->i_max_bytes, ci->i_max_files);
+ return ceph_fmt_xattr(val, size, "%llu", ci->i_max_bytes);
}
-static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci,
- char *val, size_t size)
+static ssize_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
+ char *val, size_t size)
{
- return snprintf(val, size, "%llu", ci->i_max_bytes);
+ return ceph_fmt_xattr(val, size, "%llu", ci->i_max_files);
}
-static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
- char *val, size_t size)
+/* snapshots */
+static bool ceph_vxattrcb_snap_btime_exists(struct ceph_inode_info *ci)
{
- return snprintf(val, size, "%llu", ci->i_max_files);
+ return (ci->i_snap_btime.tv_sec != 0 || ci->i_snap_btime.tv_nsec != 0);
+}
+
+static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_snap_btime.tv_sec,
+ ci->i_snap_btime.tv_nsec);
}
#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
@@ -327,7 +355,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rctime),
{
.name = "ceph.dir.pin",
- .name_size = sizeof("ceph.dir_pin"),
+ .name_size = sizeof("ceph.dir.pin"),
.getxattr_cb = ceph_vxattrcb_dir_pin,
.exists_cb = ceph_vxattrcb_dir_pin_exists,
.flags = VXATTR_FLAG_HIDDEN,
@@ -341,9 +369,15 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
},
XATTR_QUOTA_FIELD(quota, max_bytes),
XATTR_QUOTA_FIELD(quota, max_files),
+ {
+ .name = "ceph.snap.btime",
+ .name_size = sizeof("ceph.snap.btime"),
+ .getxattr_cb = ceph_vxattrcb_snap_btime,
+ .exists_cb = ceph_vxattrcb_snap_btime_exists,
+ .flags = VXATTR_FLAG_READONLY,
+ },
{ .name = NULL, 0 } /* Required table terminator */
};
-static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
/* files */
@@ -360,9 +394,15 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
XATTR_LAYOUT_FIELD(file, layout, object_size),
XATTR_LAYOUT_FIELD(file, layout, pool),
XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
+ {
+ .name = "ceph.snap.btime",
+ .name_size = sizeof("ceph.snap.btime"),
+ .getxattr_cb = ceph_vxattrcb_snap_btime,
+ .exists_cb = ceph_vxattrcb_snap_btime_exists,
+ .flags = VXATTR_FLAG_READONLY,
+ },
{ .name = NULL, 0 } /* Required table terminator */
};
-static size_t ceph_file_vxattrs_name_size; /* total size of all names */
static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
{
@@ -373,47 +413,6 @@ static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
return NULL;
}
-static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
-{
- if (vxattrs == ceph_dir_vxattrs)
- return ceph_dir_vxattrs_name_size;
- if (vxattrs == ceph_file_vxattrs)
- return ceph_file_vxattrs_name_size;
- BUG_ON(vxattrs);
- return 0;
-}
-
-/*
- * Compute the aggregate size (including terminating '\0') of all
- * virtual extended attribute names in the given vxattr table.
- */
-static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
-{
- struct ceph_vxattr *vxattr;
- size_t size = 0;
-
- for (vxattr = vxattrs; vxattr->name; vxattr++) {
- if (!(vxattr->flags & VXATTR_FLAG_HIDDEN))
- size += vxattr->name_size;
- }
-
- return size;
-}
-
-/* Routines called at initialization and exit time */
-
-void __init ceph_xattr_init(void)
-{
- ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
- ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
-}
-
-void ceph_xattr_exit(void)
-{
- ceph_dir_vxattrs_name_size = 0;
- ceph_file_vxattrs_name_size = 0;
-}
-
static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
const char *name)
{
@@ -523,8 +522,8 @@ static int __set_xattr(struct ceph_inode_info *ci,
dout("__set_xattr_val p=%p\n", p);
}
- dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
- ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
+ dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n",
+ ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val);
return 0;
}
@@ -823,7 +822,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
struct ceph_inode_xattr *xattr;
struct ceph_vxattr *vxattr = NULL;
int req_mask;
- int err;
+ ssize_t err;
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
@@ -835,8 +834,11 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
if (err)
return err;
err = -ENODATA;
- if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
+ if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
err = vxattr->getxattr_cb(ci, value, size);
+ if (size && size < err)
+ err = -ERANGE;
+ }
return err;
}
@@ -897,10 +899,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
struct inode *inode = d_inode(dentry);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
- u32 vir_namelen = 0;
+ bool len_only = (size == 0);
u32 namelen;
int err;
- u32 len;
int i;
spin_lock(&ci->i_ceph_lock);
@@ -919,38 +920,45 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
err = __build_xattrs(inode);
if (err < 0)
goto out;
- /*
- * Start with virtual dir xattr names (if any) (including
- * terminating '\0' characters for each).
- */
- vir_namelen = ceph_vxattrs_name_size(vxattrs);
- /* adding 1 byte per each variable due to the null termination */
+ /* add 1 byte for each xattr due to the null termination */
namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
- err = -ERANGE;
- if (size && vir_namelen + namelen > size)
- goto out;
-
- err = namelen + vir_namelen;
- if (size == 0)
- goto out;
+ if (!len_only) {
+ if (namelen > size) {
+ err = -ERANGE;
+ goto out;
+ }
+ names = __copy_xattr_names(ci, names);
+ size -= namelen;
+ }
- names = __copy_xattr_names(ci, names);
/* virtual xattr names, too */
- err = namelen;
if (vxattrs) {
for (i = 0; vxattrs[i].name; i++) {
- if (!(vxattrs[i].flags & VXATTR_FLAG_HIDDEN) &&
- !(vxattrs[i].exists_cb &&
- !vxattrs[i].exists_cb(ci))) {
- len = sprintf(names, "%s", vxattrs[i].name);
- names += len + 1;
- err += len + 1;
+ size_t this_len;
+
+ if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN)
+ continue;
+ if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci))
+ continue;
+
+ this_len = strlen(vxattrs[i].name) + 1;
+ namelen += this_len;
+ if (len_only)
+ continue;
+
+ if (this_len > size) {
+ err = -ERANGE;
+ goto out;
}
+
+ memcpy(names, vxattrs[i].name, this_len);
+ names += this_len;
+ size -= this_len;
}
}
-
+ err = namelen;
out:
spin_unlock(&ci->i_ceph_lock);
return err;
@@ -1206,4 +1214,138 @@ bool ceph_security_xattr_deadlock(struct inode *in)
spin_unlock(&ci->i_ceph_lock);
return ret;
}
+
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
+ struct ceph_acl_sec_ctx *as_ctx)
+{
+ struct ceph_pagelist *pagelist = as_ctx->pagelist;
+ const char *name;
+ size_t name_len;
+ int err;
+
+ err = security_dentry_init_security(dentry, mode, &dentry->d_name,
+ &as_ctx->sec_ctx,
+ &as_ctx->sec_ctxlen);
+ if (err < 0) {
+ WARN_ON_ONCE(err != -EOPNOTSUPP);
+ err = 0; /* do nothing */
+ goto out;
+ }
+
+ err = -ENOMEM;
+ if (!pagelist) {
+ pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+ if (!pagelist)
+ goto out;
+ err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
+ if (err)
+ goto out;
+ ceph_pagelist_encode_32(pagelist, 1);
+ }
+
+ /*
+ * FIXME: Make security_dentry_init_security() generic. Currently
+ * it only supports a single security module, and only SELinux has
+ * a dentry_init_security hook.
+ */
+ name = XATTR_NAME_SELINUX;
+ name_len = strlen(name);
+ err = ceph_pagelist_reserve(pagelist,
+ 4 * 2 + name_len + as_ctx->sec_ctxlen);
+ if (err)
+ goto out;
+
+ if (as_ctx->pagelist) {
+ /* update count of KV pairs */
+ BUG_ON(pagelist->length <= sizeof(__le32));
+ if (list_is_singular(&pagelist->head)) {
+ le32_add_cpu((__le32*)pagelist->mapped_tail, 1);
+ } else {
+ struct page *page = list_first_entry(&pagelist->head,
+ struct page, lru);
+ void *addr = kmap_atomic(page);
+ le32_add_cpu((__le32*)addr, 1);
+ kunmap_atomic(addr);
+ }
+ } else {
+ as_ctx->pagelist = pagelist;
+ }
+
+ ceph_pagelist_encode_32(pagelist, name_len);
+ ceph_pagelist_append(pagelist, name, name_len);
+
+ ceph_pagelist_encode_32(pagelist, as_ctx->sec_ctxlen);
+ ceph_pagelist_append(pagelist, as_ctx->sec_ctx, as_ctx->sec_ctxlen);
+
+ err = 0;
+out:
+ if (pagelist && !as_ctx->pagelist)
+ ceph_pagelist_release(pagelist);
+ return err;
+}
+
+void ceph_security_invalidate_secctx(struct inode *inode)
+{
+ security_inode_invalidate_secctx(inode);
+}
+
+static int ceph_xattr_set_security_label(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ if (security_ismaclabel(key)) {
+ const char *name = xattr_full_name(handler, key);
+ return __ceph_setxattr(inode, name, buf, buflen, flags);
+ }
+ return -EOPNOTSUPP;
+}
+
+static int ceph_xattr_get_security_label(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ if (security_ismaclabel(key)) {
+ const char *name = xattr_full_name(handler, key);
+ return __ceph_getxattr(inode, name, buf, buflen);
+ }
+ return -EOPNOTSUPP;
+}
+
+static const struct xattr_handler ceph_security_label_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = ceph_xattr_get_security_label,
+ .set = ceph_xattr_set_security_label,
+};
+#endif
#endif
+
+void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
+{
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ posix_acl_release(as_ctx->acl);
+ posix_acl_release(as_ctx->default_acl);
+#endif
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+ security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen);
+#endif
+ if (as_ctx->pagelist)
+ ceph_pagelist_release(as_ctx->pagelist);
+}
+
+/*
+ * List of handlers for synthetic system.* attributes. Other
+ * attributes are handled directly.
+ */
+const struct xattr_handler *ceph_xattr_handlers[] = {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+ &ceph_security_label_handler,
+#endif
+ &ceph_other_xattr_handler,
+ NULL,
+};
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 523e9ea78a28..b16219e5dac9 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -13,9 +13,11 @@ config CIFS
select CRYPTO_LIB_ARC4
select CRYPTO_AEAD2
select CRYPTO_CCM
+ select CRYPTO_GCM
select CRYPTO_ECB
select CRYPTO_AES
select CRYPTO_DES
+ select KEYS
help
This is the client VFS module for the SMB3 family of NAS protocols,
(including support for the most recent, most secure dialect SMB3.1.1)
@@ -109,7 +111,7 @@ config CIFS_WEAK_PW_HASH
config CIFS_UPCALL
bool "Kerberos/SPNEGO advanced session setup"
- depends on CIFS && KEYS
+ depends on CIFS
select DNS_RESOLVER
help
Enables an upcall mechanism for CIFS which accesses userspace helper
@@ -144,14 +146,6 @@ config CIFS_POSIX
(such as Samba 3.10 and later) which can negotiate
CIFS POSIX ACL support. If unsure, say N.
-config CIFS_ACL
- bool "Provide CIFS ACL support"
- depends on CIFS_XATTR && KEYS
- help
- Allows fetching CIFS/NTFS ACL from the server. The DACL blob
- is handed over to the application/caller. See the man
- page for getcifsacl for more information. If unsure, say Y.
-
config CIFS_DEBUG
bool "Enable CIFS debugging routines"
default y
@@ -184,7 +178,7 @@ config CIFS_DEBUG_DUMP_KEYS
config CIFS_DFS_UPCALL
bool "DFS feature support"
- depends on CIFS && KEYS
+ depends on CIFS
select DNS_RESOLVER
help
Distributed File System (DFS) support is used to access shares
@@ -203,10 +197,10 @@ config CIFS_NFSD_EXPORT
Allows NFS server to export a CIFS mounted share (nfsd over cifs)
config CIFS_SMB_DIRECT
- bool "SMB Direct support (Experimental)"
+ bool "SMB Direct support"
depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
help
- Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1.
+ Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1.
SMB Direct allows transferring SMB packets over RDMA. If unsure,
say N.
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 51af69a1a328..41332f20055b 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -10,10 +10,9 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
cifs_unicode.o nterr.o cifsencrypt.o \
readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \
smb2ops.o smb2maperror.o smb2transport.o \
- smb2misc.o smb2pdu.o smb2inode.o smb2file.o
+ smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o
cifs-$(CONFIG_CIFS_XATTR) += xattr.o
-cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ec933fb0b36e..a38d796f5ffe 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -240,9 +240,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_CIFS_XATTR
seq_printf(m, ",XATTR");
#endif
-#ifdef CONFIG_CIFS_ACL
seq_printf(m, ",ACL");
-#endif
seq_putc(m, '\n');
seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize);
seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index ed49222abecb..b326d2ca3765 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -52,6 +52,7 @@
#define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */
#define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */
#define CIFS_MOUNT_NO_DFS 0x8000000 /* disable DFS resolving */
+#define CIFS_MOUNT_MODE_FROM_SID 0x10000000 /* retrieve mode from special ACE */
struct cifs_sb_info {
struct rb_root tlink_tree;
@@ -83,5 +84,10 @@ struct cifs_sb_info {
* failover properly.
*/
char *origin_fullpath; /* \\HOST\SHARE\[OPTIONAL PATH] */
+ /*
+ * Indicate whether serverino option was turned off later
+ * (cifs_autodisable_serverino) in order to match new mounts.
+ */
+ bool mnt_cifs_serverino_autodisabled;
};
#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 24635b65effa..270d3c58fb3b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -526,6 +526,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",nobrl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_HANDLE_CACHE)
seq_puts(s, ",nohandlecache");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)
+ seq_puts(s, ",modefromsid");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
seq_puts(s, ",cifsacl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
@@ -554,6 +556,11 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",bsize=%u", cifs_sb->bsize);
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
+
+ /* Only display max_credits if it was overridden on mount */
+ if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE)
+ seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits);
+
if (tcon->snapshot_time)
seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
if (tcon->handle_timeout)
@@ -1517,11 +1524,9 @@ init_cifs(void)
goto out_destroy_dfs_cache;
#endif /* CONFIG_CIFS_UPCALL */
-#ifdef CONFIG_CIFS_ACL
rc = init_cifs_idmap();
if (rc)
goto out_register_key_type;
-#endif /* CONFIG_CIFS_ACL */
rc = register_filesystem(&cifs_fs_type);
if (rc)
@@ -1536,10 +1541,8 @@ init_cifs(void)
return 0;
out_init_cifs_idmap:
-#ifdef CONFIG_CIFS_ACL
exit_cifs_idmap();
out_register_key_type:
-#endif
#ifdef CONFIG_CIFS_UPCALL
exit_cifs_spnego();
out_destroy_dfs_cache:
@@ -1571,9 +1574,7 @@ exit_cifs(void)
unregister_filesystem(&cifs_fs_type);
unregister_filesystem(&smb3_fs_type);
cifs_dfs_release_automount_timer();
-#ifdef CONFIG_CIFS_ACL
exit_cifs_idmap();
-#endif
#ifdef CONFIG_CIFS_UPCALL
exit_cifs_spnego();
#endif
@@ -1607,5 +1608,6 @@ MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: sha512");
MODULE_SOFTDEP("pre: aead2");
MODULE_SOFTDEP("pre: ccm");
+MODULE_SOFTDEP("pre: gcm");
module_init(init_cifs)
module_exit(exit_cifs)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4777b3c4a92c..fe610e7e3670 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -550,6 +550,7 @@ struct smb_vol {
bool override_gid:1;
bool dynperm:1;
bool noperm:1;
+ bool mode_ace:1;
bool no_psx_acl:1; /* set if posix acl support should be disabled */
bool cifs_acl:1;
bool backupuid_specified; /* mount option backupuid is specified */
@@ -600,6 +601,7 @@ struct smb_vol {
__u64 snapshot_time; /* needed for timewarp tokens */
__u32 handle_timeout; /* persistent and durable handle timeout in ms */
unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
+ __u16 compression; /* compression algorithm 0xFFFF default 0=disabled */
};
/**
@@ -617,7 +619,8 @@ struct smb_vol {
CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \
CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \
CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID | \
- CIFS_MOUNT_NO_DFS)
+ CIFS_MOUNT_UID_FROM_ACL | CIFS_MOUNT_NO_HANDLE_CACHE | \
+ CIFS_MOUNT_NO_DFS | CIFS_MOUNT_MODE_FROM_SID)
/**
* Generic VFS superblock mount flags (s_flags) to consider when
@@ -1870,7 +1873,6 @@ extern unsigned int cifs_min_small; /* min size of small buf pool */
extern unsigned int cifs_max_pending; /* MAX requests at once to server*/
extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */
-#ifdef CONFIG_CIFS_ACL
GLOBAL_EXTERN struct rb_root uidtree;
GLOBAL_EXTERN struct rb_root gidtree;
GLOBAL_EXTERN spinlock_t siduidlock;
@@ -1879,7 +1881,6 @@ GLOBAL_EXTERN struct rb_root siduidtree;
GLOBAL_EXTERN struct rb_root sidgidtree;
GLOBAL_EXTERN spinlock_t uidsidlock;
GLOBAL_EXTERN spinlock_t gidsidlock;
-#endif /* CONFIG_CIFS_ACL */
void cifs_oplock_break(struct work_struct *work);
void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1fbd92843a73..e2f95965065d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3600,11 +3600,9 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
return size;
}
-static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
+static void convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
const struct posix_acl_xattr_entry *local_ace)
{
- __u16 rc = 0; /* 0 = ACL converted ok */
-
cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm);
cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag);
/* BB is there a better way to handle the large uid? */
@@ -3617,7 +3615,6 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
cifs_dbg(FYI, "perm %d tag %d id %d\n",
ace->e_perm, ace->e_tag, ace->e_id);
*/
- return rc;
}
/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */
@@ -3653,13 +3650,8 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
return 0;
}
- for (i = 0; i < count; i++) {
- rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]);
- if (rc != 0) {
- /* ACE not converted */
- break;
- }
- }
+ for (i = 0; i < count; i++)
+ convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]);
if (rc == 0) {
rc = (__u16)(count * sizeof(struct cifs_posix_ace));
rc += sizeof(struct cifs_posix_acl);
@@ -3920,7 +3912,6 @@ GetExtAttrOut:
#endif /* CONFIG_POSIX */
-#ifdef CONFIG_CIFS_ACL
/*
* Initialize NT TRANSACT SMB into small smb request buffer. This assumes that
* all NT TRANSACTS that we init here have total parm and data under about 400
@@ -4164,7 +4155,6 @@ setCifsAclRetry:
return (rc);
}
-#endif /* CONFIG_CIFS_ACL */
/* Legacy Query Path Information call for lookup to old servers such
as Win9x/WinME */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 714a359c7c8d..a4830ced0f98 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -96,7 +96,8 @@ enum {
Opt_multiuser, Opt_sloppy, Opt_nosharesock,
Opt_persistent, Opt_nopersistent,
Opt_resilient, Opt_noresilient,
- Opt_domainauto, Opt_rdma,
+ Opt_domainauto, Opt_rdma, Opt_modesid,
+ Opt_compress,
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -175,6 +176,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_serverino, "serverino" },
{ Opt_noserverino, "noserverino" },
{ Opt_rwpidforward, "rwpidforward" },
+ { Opt_modesid, "modefromsid" },
{ Opt_cifsacl, "cifsacl" },
{ Opt_nocifsacl, "nocifsacl" },
{ Opt_acl, "acl" },
@@ -212,6 +214,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_echo_interval, "echo_interval=%s" },
{ Opt_max_credits, "max_credits=%s" },
{ Opt_snapshot, "snapshot=%s" },
+ { Opt_compress, "compress=%s" },
{ Opt_blank_user, "user=" },
{ Opt_blank_user, "username=" },
@@ -706,10 +709,10 @@ static bool
server_unresponsive(struct TCP_Server_Info *server)
{
/*
- * We need to wait 2 echo intervals to make sure we handle such
+ * We need to wait 3 echo intervals to make sure we handle such
* situations right:
* 1s client sends a normal SMB request
- * 2s client gets a response
+ * 3s client gets a response
* 30s echo workqueue job pops, and decides we got a response recently
* and don't need to send another
* ...
@@ -718,9 +721,9 @@ server_unresponsive(struct TCP_Server_Info *server)
*/
if ((server->tcpStatus == CifsGood ||
server->tcpStatus == CifsNeedNegotiate) &&
- time_after(jiffies, server->lstrp + 2 * server->echo_interval)) {
+ time_after(jiffies, server->lstrp + 3 * server->echo_interval)) {
cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n",
- server->hostname, (2 * server->echo_interval) / HZ);
+ server->hostname, (3 * server->echo_interval) / HZ);
cifs_reconnect(server);
wake_up(&server->response_q);
return true;
@@ -1223,11 +1226,11 @@ next_pdu:
atomic_read(&midCount));
cifs_dump_mem("Received Data is: ", bufs[i],
HEADER_SIZE(server));
+ smb2_add_credits_from_hdr(bufs[i], server);
#ifdef CONFIG_CIFS_DEBUG2
if (server->ops->dump_detail)
server->ops->dump_detail(bufs[i],
server);
- smb2_add_credits_from_hdr(bufs[i], server);
cifs_dump_mids(server);
#endif /* CIFS_DEBUG2 */
}
@@ -1830,6 +1833,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_rwpidforward:
vol->rwpidforward = 1;
break;
+ case Opt_modesid:
+ vol->mode_ace = 1;
+ break;
case Opt_cifsacl:
vol->cifs_acl = 1;
break;
@@ -1911,6 +1917,11 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_rdma:
vol->rdma = true;
break;
+ case Opt_compress:
+ vol->compression = UNKNOWN_TYPE;
+ cifs_dbg(VFS,
+ "SMB3 compression support is experimental\n");
+ break;
/* Numeric Values */
case Opt_backupuid:
@@ -2544,8 +2555,15 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
if (vol->nosharesock)
return 0;
- /* BB update this for smb3any and default case */
- if ((server->vals != vol->vals) || (server->ops != vol->ops))
+ /* If multidialect negotiation see if existing sessions match one */
+ if (strcmp(vol->vals->version_string, SMB3ANY_VERSION_STRING) == 0) {
+ if (server->vals->protocol_id < SMB30_PROT_ID)
+ return 0;
+ } else if (strcmp(vol->vals->version_string,
+ SMBDEFAULT_VERSION_STRING) == 0) {
+ if (server->vals->protocol_id < SMB21_PROT_ID)
+ return 0;
+ } else if ((server->vals != vol->vals) || (server->ops != vol->ops))
return 0;
if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
@@ -2680,6 +2698,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->sequence_number = 0;
tcp_ses->reconnect_instance = 1;
tcp_ses->lstrp = jiffies;
+ tcp_ses->compress_algorithm = cpu_to_le16(volume_info->compression);
spin_lock_init(&tcp_ses->req_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
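
The match_server() hunk above relaxes socket reuse for the two multidialect mounts. A compact predicate capturing that rule; the protocol IDs mirror smb2pdu.h, while the mode strings here are illustrative (the real version_string values differ):

#include <stdbool.h>
#include <string.h>

#define SMB21_PROT_ID 0x0210
#define SMB30_PROT_ID 0x0300

/* An existing connection can serve a multidialect mount when the dialect
 * it already negotiated falls inside the range the mount would request. */
static bool server_reusable(const char *mode, unsigned short negotiated)
{
        if (strcmp(mode, "smb3any") == 0)       /* SMB3.0 through SMB3.1.1 */
                return negotiated >= SMB30_PROT_ID;
        if (strcmp(mode, "smbdefault") == 0)    /* SMB2.1 through SMB3.1.1 */
                return negotiated >= SMB21_PROT_ID;
        return false;   /* exact-version mounts still need an exact match */
}
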
@@ -3460,12 +3479,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
{
struct cifs_sb_info *old = CIFS_SB(sb);
struct cifs_sb_info *new = mnt_data->cifs_sb;
+ unsigned int oldflags = old->mnt_cifs_flags & CIFS_MOUNT_MASK;
+ unsigned int newflags = new->mnt_cifs_flags & CIFS_MOUNT_MASK;
if ((sb->s_flags & CIFS_MS_MASK) != (mnt_data->flags & CIFS_MS_MASK))
return 0;
- if ((old->mnt_cifs_flags & CIFS_MOUNT_MASK) !=
- (new->mnt_cifs_flags & CIFS_MOUNT_MASK))
+ if (old->mnt_cifs_serverino_autodisabled)
+ newflags &= ~CIFS_MOUNT_SERVER_INUM;
+
+ if (oldflags != newflags)
return 0;
/*
@@ -3965,6 +3988,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
if (pvolume_info->rwpidforward)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
+ if (pvolume_info->mode_ace)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MODE_FROM_SID;
if (pvolume_info->cifs_acl)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
if (pvolume_info->backupuid_specified) {
@@ -4459,11 +4484,13 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
- char *full_path)
+ char *full_path,
+ int added_treename)
{
int rc;
char *s;
char sep, tmp;
+ int skip = added_treename ? 1 : 0;
sep = CIFS_DIR_SEP(cifs_sb);
s = full_path;
@@ -4478,7 +4505,14 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
/* next separator */
while (*s && *s != sep)
s++;
-
+ /*
+ * if the tree name was prepended to the path, skip its leading
+ * component (the part between the first two separators)
+ */
+ if (skip) {
+ skip = 0;
+ continue;
+ }
/*
* temporarily null-terminate the path at the end of
* the current component
@@ -4526,8 +4560,7 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol,
if (rc != -EREMOTE) {
rc = cifs_are_all_path_components_accessible(server, xid, tcon,
- cifs_sb,
- full_path);
+ cifs_sb, full_path, tcon->Flags & SMB_SHARE_IS_IN_DFS);
if (rc != 0) {
cifs_dbg(VFS, "cannot query dirs between root and final path, "
"enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index e3e1c13df439..1692c0c6c23a 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -492,7 +492,7 @@ static struct dfs_cache_entry *__find_cache_entry(unsigned int hash,
#ifdef CONFIG_CIFS_DEBUG2
char *name = get_tgt_name(ce);
- if (unlikely(IS_ERR(name))) {
+ if (IS_ERR(name)) {
rcu_read_unlock();
return ERR_CAST(name);
}
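
The dropped unlikely() above is redundant rather than wrong: the kernel's IS_ERR() already contains a branch hint internally. A minimal userspace rendering of the pointer-encoded-error convention in play here (lookup_name() is a stand-in for get_tgt_name()):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095
#define ERR_PTR(err) ((void *)(long)(err))
#define PTR_ERR(ptr) ((long)(ptr))
#define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

/* Returns either a valid name or an errno encoded in the pointer. */
static void *lookup_name(int fail)
{
        return fail ? ERR_PTR(-ENOENT) : (void *)"target";
}

int main(void)
{
        void *name = lookup_name(1);

        if (IS_ERR(name))               /* the error rides in the pointer */
                printf("error: %ld\n", PTR_ERR(name));
        return 0;
}
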
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index d7cc62252634..1bffe029fb66 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -892,7 +892,6 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc);
}
-#ifdef CONFIG_CIFS_ACL
/* fill in 0777 bits from ACL */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid);
@@ -902,7 +901,6 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
goto cgii_exit;
}
}
-#endif /* CONFIG_CIFS_ACL */
/* fill in remaining high mode bits e.g. SUID, VTX */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
@@ -2415,7 +2413,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
xid = get_xid();
- cifs_dbg(FYI, "setattr on file %pd attrs->iavalid 0x%x\n",
+ cifs_dbg(FYI, "setattr on file %pd attrs->ia_valid 0x%x\n",
direntry, attrs->ia_valid);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
@@ -2466,7 +2464,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
if (attrs->ia_valid & ATTR_GID)
gid = attrs->ia_gid;
-#ifdef CONFIG_CIFS_ACL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
if (uid_valid(uid) || gid_valid(gid)) {
rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
@@ -2478,7 +2475,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
}
}
} else
-#endif /* CONFIG_CIFS_ACL */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID))
attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
@@ -2489,7 +2485,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
if (attrs->ia_valid & ATTR_MODE) {
mode = attrs->ia_mode;
rc = 0;
-#ifdef CONFIG_CIFS_ACL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
rc = id_mode_to_cifs_acl(inode, full_path, mode,
INVALID_UID, INVALID_GID);
@@ -2499,7 +2494,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
goto cifs_setattr_exit;
}
} else
-#endif /* CONFIG_CIFS_ACL */
if (((mode & S_IWUGO) == 0) &&
(cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index b1a696a73f7c..f383877a6511 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -539,6 +539,7 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
tcon = cifs_sb_master_tcon(cifs_sb);
cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
+ cifs_sb->mnt_cifs_serverino_autodisabled = true;
cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s.\n",
tcon ? tcon->treeName : "new server");
cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS).\n");
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 9e430ae9314f..b7421a096319 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -1223,16 +1223,15 @@ struct smb_version_operations smb1_operations = {
.query_all_EAs = CIFSSMBQAllEAs,
.set_EA = CIFSSMBSetEA,
#endif /* CIFS_XATTR */
-#ifdef CONFIG_CIFS_ACL
.get_acl = get_cifs_acl,
.get_acl_by_fid = get_cifs_acl_by_fid,
.set_acl = set_cifs_acl,
-#endif /* CIFS_ACL */
.make_node = cifs_make_node,
};
struct smb_version_values smb1_values = {
.version_string = SMB1_VERSION_STRING,
+ .protocol_id = SMB10_PROT_ID,
.large_lock_type = LOCKING_ANDX_LARGE_FILES,
.exclusive_lock_type = 0,
.shared_lock_type = LOCKING_ANDX_SHARED_LOCK,
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 278405d26c47..d8d9cdfa30b6 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -120,6 +120,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_O_INFO_FILE, 0,
sizeof(struct smb2_file_all_info) +
PATH_MAX * 2, 0, NULL);
+ if (rc)
+ goto finished;
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid,
@@ -147,6 +149,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
COMPOUND_FID, current->tgid,
FILE_DISPOSITION_INFORMATION,
SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto finished;
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path);
@@ -163,6 +167,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
COMPOUND_FID, current->tgid,
FILE_END_OF_FILE_INFORMATION,
SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto finished;
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
@@ -180,6 +186,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
COMPOUND_FID, current->tgid,
FILE_BASIC_INFORMATION,
SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto finished;
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid,
@@ -206,6 +214,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
COMPOUND_FID, current->tgid,
FILE_RENAME_INFORMATION,
SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto finished;
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
@@ -231,6 +241,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
COMPOUND_FID, current->tgid,
FILE_LINK_INFORMATION,
SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto finished;
smb2_set_next_command(tcon, &rqst[num_rqst]);
smb2_set_related(&rqst[num_rqst++]);
trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path);
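
Every hunk in this file adds the same guard: stop assembling the compound once any setup step fails, instead of chaining a request that was never built. The control flow reduced to a sketch, with a fake setup_step() standing in for smb2_plain_req_init() and friends:

#include <stdio.h>

#define NUM_RQST 3

static int setup_step(int i) { return i == 2 ? -1 : 0; } /* fake failure */

int main(void)
{
        int rc = 0, num_rqst = 0;

        while (num_rqst < NUM_RQST) {
                rc = setup_step(num_rqst);
                if (rc)
                        goto finished;  /* never chain a half-built request */
                num_rqst++;             /* only count fully built ones */
        }
finished:
        printf("built %d requests, rc=%d\n", num_rqst, rc);
        return 0;
}
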
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 9fd56b0acd7e..0cdc4e47ca87 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -2027,6 +2027,10 @@ smb2_set_related(struct smb_rqst *rqst)
struct smb2_sync_hdr *shdr;
shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+ if (shdr == NULL) {
+ cifs_dbg(FYI, "shdr NULL in smb2_set_related\n");
+ return;
+ }
shdr->Flags |= SMB2_FLAGS_RELATED_OPERATIONS;
}
@@ -2041,6 +2045,12 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
unsigned long len = smb_rqst_len(server, rqst);
int i, num_padding;
+ shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+ if (shdr == NULL) {
+ cifs_dbg(FYI, "shdr NULL in smb2_set_next_command\n");
+ return;
+ }
+
/* SMB headers in a compound are 8 byte aligned. */
/* No padding needed */
@@ -2080,7 +2090,6 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
}
finished:
- shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
shdr->NextCommand = cpu_to_le32(len);
}
@@ -2374,6 +2383,34 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
}
static int
+parse_reparse_posix(struct reparse_posix_data *symlink_buf,
+ u32 plen, char **target_path,
+ struct cifs_sb_info *cifs_sb)
+{
+ unsigned int len;
+
+ /* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
+ len = le16_to_cpu(symlink_buf->ReparseDataLength);
+
+ if (le64_to_cpu(symlink_buf->InodeType) != NFS_SPECFILE_LNK) {
+ cifs_dbg(VFS, "%lld not a supported symlink type\n",
+ le64_to_cpu(symlink_buf->InodeType));
+ return -EOPNOTSUPP;
+ }
+
+ *target_path = cifs_strndup_from_utf16(
+ symlink_buf->PathBuffer,
+ len, true, cifs_sb->local_nls);
+ if (!(*target_path))
+ return -ENOMEM;
+
+ convert_delimiter(*target_path, '/');
+ cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+
+ return 0;
+}
+
+static int
parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf,
u32 plen, char **target_path,
struct cifs_sb_info *cifs_sb)
@@ -2381,11 +2418,7 @@ parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf,
unsigned int sub_len;
unsigned int sub_offset;
- /* We only handle Symbolic Link : MS-FSCC 2.1.2.4 */
- if (le32_to_cpu(symlink_buf->ReparseTag) != IO_REPARSE_TAG_SYMLINK) {
- cifs_dbg(VFS, "srv returned invalid symlink buffer\n");
- return -EIO;
- }
+ /* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */
sub_offset = le16_to_cpu(symlink_buf->SubstituteNameOffset);
sub_len = le16_to_cpu(symlink_buf->SubstituteNameLength);
@@ -2407,6 +2440,41 @@ parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf,
return 0;
}
+static int
+parse_reparse_point(struct reparse_data_buffer *buf,
+ u32 plen, char **target_path,
+ struct cifs_sb_info *cifs_sb)
+{
+ if (plen < sizeof(struct reparse_data_buffer)) {
+ cifs_dbg(VFS, "reparse buffer is too small. Must be "
+ "at least 8 bytes but was %d\n", plen);
+ return -EIO;
+ }
+
+ if (plen < le16_to_cpu(buf->ReparseDataLength) +
+ sizeof(struct reparse_data_buffer)) {
+ cifs_dbg(VFS, "srv returned invalid reparse buf "
+ "length: %d\n", plen);
+ return -EIO;
+ }
+
+ /* See MS-FSCC 2.1.2 */
+ switch (le32_to_cpu(buf->ReparseTag)) {
+ case IO_REPARSE_TAG_NFS:
+ return parse_reparse_posix(
+ (struct reparse_posix_data *)buf,
+ plen, target_path, cifs_sb);
+ case IO_REPARSE_TAG_SYMLINK:
+ return parse_reparse_symlink(
+ (struct reparse_symlink_data_buffer *)buf,
+ plen, target_path, cifs_sb);
+ default:
+ cifs_dbg(VFS, "srv returned unknown symlink buffer "
+ "tag:0x%08x\n", le32_to_cpu(buf->ReparseTag));
+ return -EOPNOTSUPP;
+ }
+}
+
#define SMB2_SYMLINK_STRUCT_SIZE \
(sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
@@ -2533,23 +2601,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
goto querty_exit;
}
- if (plen < 8) {
- cifs_dbg(VFS, "reparse buffer is too small. Must be "
- "at least 8 bytes but was %d\n", plen);
- rc = -EIO;
- goto querty_exit;
- }
-
- if (plen < le16_to_cpu(reparse_buf->ReparseDataLength) + 8) {
- cifs_dbg(VFS, "srv returned invalid reparse buf "
- "length: %d\n", plen);
- rc = -EIO;
- goto querty_exit;
- }
-
- rc = parse_reparse_symlink(
- (struct reparse_symlink_data_buffer *)reparse_buf,
- plen, target_path, cifs_sb);
+ rc = parse_reparse_point(reparse_buf, plen, target_path,
+ cifs_sb);
goto querty_exit;
}
@@ -2561,26 +2614,32 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
err_buf = err_iov.iov_base;
if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) ||
err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) {
- rc = -ENOENT;
+ rc = -EINVAL;
+ goto querty_exit;
+ }
+
+ symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
+ if (le32_to_cpu(symlink->SymLinkErrorTag) != SYMLINK_ERROR_TAG ||
+ le32_to_cpu(symlink->ReparseTag) != IO_REPARSE_TAG_SYMLINK) {
+ rc = -EINVAL;
goto querty_exit;
}
/* open must fail on symlink - reset rc */
rc = 0;
- symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
sub_len = le16_to_cpu(symlink->SubstituteNameLength);
sub_offset = le16_to_cpu(symlink->SubstituteNameOffset);
print_len = le16_to_cpu(symlink->PrintNameLength);
print_offset = le16_to_cpu(symlink->PrintNameOffset);
if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
- rc = -ENOENT;
+ rc = -EINVAL;
goto querty_exit;
}
if (err_iov.iov_len <
SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
- rc = -ENOENT;
+ rc = -EINVAL;
goto querty_exit;
}
@@ -2606,7 +2665,6 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-#ifdef CONFIG_CIFS_ACL
static struct cifs_ntsd *
get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen)
@@ -2691,7 +2749,6 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
return pntsd;
}
-#ifdef CONFIG_CIFS_ACL
static int
set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
struct inode *inode, const char *path, int aclflag)
@@ -2749,7 +2806,6 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
free_xid(xid);
return rc;
}
-#endif /* CIFS_ACL */
/* Retrieve an ACL from the server */
static struct cifs_ntsd *
@@ -2769,7 +2825,6 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb,
cifsFileInfo_put(open_file);
return pntsd;
}
-#endif
static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len, bool keep_size)
@@ -3367,7 +3422,7 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile)
static void
fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
- struct smb_rqst *old_rq)
+ struct smb_rqst *old_rq, __le16 cipher_type)
{
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base;
@@ -3376,7 +3431,10 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len);
tr_hdr->Flags = cpu_to_le16(0x01);
- get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE);
+ if (cipher_type == SMB2_ENCRYPTION_AES128_GCM)
+ get_random_bytes(&tr_hdr->Nonce, SMB3_AES128GCM_NONCE);
+ else
+ get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CCM_NONCE);
memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
}
@@ -3534,8 +3592,13 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
rc = -ENOMEM;
goto free_sg;
}
- iv[0] = 3;
- memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CMM_NONCE);
+
+ if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM)
+ memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES128GCM_NONCE);
+ else {
+ iv[0] = 3;
+ memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CCM_NONCE);
+ }
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
aead_request_set_ad(req, assoc_data_len);
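
The IV handling above follows the kernel crypto API's CCM convention: a 16-byte IV whose first octet is L - 1 (with L = 15 - nonce length, so 3 for the 11-byte SMB3 CCM nonce), while GCM's 12-byte nonce fills the IV directly. Isolated as a sketch, with fill_iv() as an illustrative name:

#include <string.h>

#define AES_BLOCK_SIZE 16
#define CCM_NONCE_LEN  11    /* SMB3_AES128CCM_NONCE */
#define GCM_NONCE_LEN  12    /* SMB3_AES128GCM_NONCE */

static void fill_iv(unsigned char iv[AES_BLOCK_SIZE],
                    const unsigned char *nonce, int is_gcm)
{
        memset(iv, 0, AES_BLOCK_SIZE);
        if (is_gcm) {
                memcpy(iv, nonce, GCM_NONCE_LEN);   /* nonce is the IV */
        } else {
                iv[0] = 3;   /* CCM: L - 1, with L = 15 - 11 = 4 */
                memcpy(iv + 1, nonce, CCM_NONCE_LEN);
        }
}
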
@@ -3635,7 +3698,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
}
/* fill the 1st iov with a transform header */
- fill_transform_hdr(tr_hdr, orig_len, old_rq);
+ fill_transform_hdr(tr_hdr, orig_len, old_rq, server->cipher_type);
rc = crypt_message(server, num_rqst, new_rq, 1);
cifs_dbg(FYI, "Encrypt message returned %d\n", rc);
@@ -4284,11 +4347,9 @@ struct smb_version_operations smb20_operations = {
.query_all_EAs = smb2_query_eas,
.set_EA = smb2_set_ea,
#endif /* CIFS_XATTR */
-#ifdef CONFIG_CIFS_ACL
.get_acl = get_smb2_acl,
.get_acl_by_fid = get_smb2_acl_by_fid,
.set_acl = set_smb2_acl,
-#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
.make_node = smb2_make_node,
@@ -4385,11 +4446,9 @@ struct smb_version_operations smb21_operations = {
.query_all_EAs = smb2_query_eas,
.set_EA = smb2_set_ea,
#endif /* CIFS_XATTR */
-#ifdef CONFIG_CIFS_ACL
.get_acl = get_smb2_acl,
.get_acl_by_fid = get_smb2_acl_by_fid,
.set_acl = set_smb2_acl,
-#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
.make_node = smb2_make_node,
@@ -4495,11 +4554,9 @@ struct smb_version_operations smb30_operations = {
.query_all_EAs = smb2_query_eas,
.set_EA = smb2_set_ea,
#endif /* CIFS_XATTR */
-#ifdef CONFIG_CIFS_ACL
.get_acl = get_smb2_acl,
.get_acl_by_fid = get_smb2_acl_by_fid,
.set_acl = set_smb2_acl,
-#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
.make_node = smb2_make_node,
@@ -4606,11 +4663,9 @@ struct smb_version_operations smb311_operations = {
.query_all_EAs = smb2_query_eas,
.set_EA = smb2_set_ea,
#endif /* CIFS_XATTR */
-#ifdef CONFIG_CIFS_ACL
.get_acl = get_smb2_acl,
.get_acl_by_fid = get_smb2_acl_by_fid,
.set_acl = set_smb2_acl,
-#endif /* CIFS_ACL */
.next_header = smb2_next_header,
.ioctl_query_info = smb2_ioctl_query_info,
.make_node = smb2_make_node,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 75311a8a68bf..f58e4dc3987b 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -489,10 +489,25 @@ static void
build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt)
{
pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES;
- pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + le16 cipher */
- pneg_ctxt->CipherCount = cpu_to_le16(1);
-/* pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;*/ /* not supported yet */
- pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_CCM;
+ pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + two ciphers */
+ pneg_ctxt->CipherCount = cpu_to_le16(2);
+ pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;
+ pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM;
+}
+
+static unsigned int
+build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
+{
+ struct nls_table *cp = load_nls_default();
+
+ pneg_ctxt->ContextType = SMB2_NETNAME_NEGOTIATE_CONTEXT_ID;
+
+ /* copy at most the first 100 bytes of the server name to the NetName field */
+ pneg_ctxt->DataLength = cpu_to_le16(2 +
+ (2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp)));
+ /* context size is DataLength + minimal smb2_neg_context */
+ return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) +
+ sizeof(struct smb2_neg_context), 8) * 8;
}
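
build_netname_ctxt() above returns the context size rounded up to the 8-byte boundary SMB2 negotiate contexts are packed on. The same DIV_ROUND_UP idiom in miniature, with an example length:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Pad a variable-length negotiate context to the next 8-byte boundary. */
static unsigned int ctxt_size(unsigned int data_len, unsigned int hdr_len)
{
        return DIV_ROUND_UP(data_len + hdr_len, 8) * 8;
}

int main(void)
{
        /* e.g. an 8-byte context header plus 22 bytes of NetName pads to 32 */
        printf("%u\n", ctxt_size(22, 8));
        return 0;
}
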
static void
@@ -521,7 +536,7 @@ build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
static void
assemble_neg_contexts(struct smb2_negotiate_req *req,
- unsigned int *total_len)
+ struct TCP_Server_Info *server, unsigned int *total_len)
{
char *pneg_ctxt = (char *)req;
unsigned int ctxt_len;
@@ -551,17 +566,25 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
- build_compression_ctxt((struct smb2_compression_capabilities_context *)
+ if (server->compress_algorithm) {
+ build_compression_ctxt((struct smb2_compression_capabilities_context *)
pneg_ctxt);
- ctxt_len = DIV_ROUND_UP(
- sizeof(struct smb2_compression_capabilities_context), 8) * 8;
+ ctxt_len = DIV_ROUND_UP(
+ sizeof(struct smb2_compression_capabilities_context),
+ 8) * 8;
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
+ req->NegotiateContextCount = cpu_to_le16(5);
+ } else
+ req->NegotiateContextCount = cpu_to_le16(4);
+
+ ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
+ server->hostname);
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
*total_len += sizeof(struct smb2_posix_neg_context);
-
- req->NegotiateContextCount = cpu_to_le16(4);
}
static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt)
@@ -829,7 +852,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
if ((ses->server->vals->protocol_id == SMB311_PROT_ID) ||
(strcmp(ses->server->vals->version_string,
SMBDEFAULT_VERSION_STRING) == 0))
- assemble_neg_contexts(req, &total_len);
+ assemble_neg_contexts(req, server, &total_len);
}
iov[0].iov_base = (char *)req;
iov[0].iov_len = total_len;
@@ -2095,6 +2118,48 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp)
return 0;
}
+static struct crt_query_id_ctxt *
+create_query_id_buf(void)
+{
+ struct crt_query_id_ctxt *buf;
+
+ buf = kzalloc(sizeof(struct crt_query_id_ctxt), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(0);
+ buf->ccontext.DataLength = cpu_to_le32(0);
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct crt_query_id_ctxt, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_QUERY_ON_DISK_ID is "QFid" */
+ buf->Name[0] = 'Q';
+ buf->Name[1] = 'F';
+ buf->Name[2] = 'i';
+ buf->Name[3] = 'd';
+ return buf;
+}
+
+/* See MS-SMB2 2.2.13.2.9 */
+static int
+add_query_id_context(struct kvec *iov, unsigned int *num_iovec)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ iov[num].iov_base = create_query_id_buf();
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct crt_query_id_ctxt);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset = cpu_to_le32(
+ sizeof(struct smb2_create_req) +
+ iov[num - 1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_query_id_ctxt));
+ *num_iovec = num + 1;
+ return 0;
+}
+
static int
alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
const char *treename, const __le16 *path)
@@ -2423,6 +2488,12 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock,
return rc;
}
+ if (n_iov > 2) {
+ struct create_context *ccontext =
+ (struct create_context *)iov[n_iov-1].iov_base;
+ ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len);
+ }
+ add_query_id_context(iov, &n_iov);
rqst->rq_nvec = n_iov;
return 0;
@@ -2550,12 +2621,11 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
* indatalen is usually small at a couple of bytes max, so
* just allocate through generic pool
*/
- in_data_buf = kmalloc(indatalen, GFP_NOFS);
+ in_data_buf = kmemdup(in_data, indatalen, GFP_NOFS);
if (!in_data_buf) {
cifs_small_buf_release(req);
return -ENOMEM;
}
- memcpy(in_data_buf, in_data, indatalen);
}
req->CtlCode = cpu_to_le32(opcode);
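
The last hunk above folds kmalloc() plus memcpy() into kmemdup(), which performs the same allocate-and-copy in a single call; a userspace equivalent of that helper looks like this:

#include <stdlib.h>
#include <string.h>

/* Userspace sketch of kmemdup(): allocate and copy in one step. */
static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);
        return p;   /* NULL on allocation failure, as with kmemdup() */
}
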
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 858353d20c39..7e2e782f8edd 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -123,7 +123,7 @@ struct smb2_sync_pdu {
__le16 StructureSize2; /* size of wct area (varies, request specific) */
} __packed;
-#define SMB3_AES128CMM_NONCE 11
+#define SMB3_AES128CCM_NONCE 11
#define SMB3_AES128GCM_NONCE 12
struct smb2_transform_hdr {
@@ -166,6 +166,8 @@ struct smb2_err_rsp {
__u8 ErrorData[1]; /* variable length */
} __packed;
+#define SYMLINK_ERROR_TAG 0x4c4d5953
+
struct smb2_symlink_err_rsp {
__le32 SymLinkLength;
__le32 SymLinkErrorTag;
@@ -227,6 +229,7 @@ struct smb2_negotiate_req {
} __packed;
/* Dialects */
+#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */
#define SMB20_PROT_ID 0x0202
#define SMB21_PROT_ID 0x0210
#define SMB30_PROT_ID 0x0300
@@ -293,7 +296,7 @@ struct smb2_encryption_neg_context {
__le16 DataLength;
__le32 Reserved;
__le16 CipherCount; /* AES-128-GCM and AES-128-CCM */
- __le16 Ciphers[1]; /* Ciphers[0] since only one used now */
+ __le16 Ciphers[2];
} __packed;
/* See MS-SMB2 2.2.3.1.3 */
@@ -316,6 +319,12 @@ struct smb2_compression_capabilities_context {
* For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4.
* Its struct simply contains NetName, an array of Unicode characters
*/
+struct smb2_netname_neg_context {
+ __le16 ContextType; /* 0x100 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 NetName[0]; /* hostname of target converted to UCS-2 */
+} __packed;
#define POSIX_CTXT_DATA_LEN 16
struct smb2_posix_neg_context {
@@ -640,6 +649,7 @@ struct smb2_tree_disconnect_rsp {
#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74
+#define SMB2_CREATE_APP_INSTANCE_VERSION 0xB982D0B73B56074FA07B524A8116A010
#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83
#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C
@@ -654,9 +664,10 @@ struct smb2_tree_disconnect_rsp {
* [3] : durable context
* [4] : posix context
* [5] : time warp context
- * [6] : compound padding
+ * [6] : query id context
+ * [7] : compound padding
*/
-#define SMB2_CREATE_IOV_SIZE 7
+#define SMB2_CREATE_IOV_SIZE 8
struct smb2_create_req {
struct smb2_sync_hdr sync_hdr;
@@ -680,10 +691,10 @@ struct smb2_create_req {
/*
* Maximum size of a SMB2_CREATE response is 64 (smb2 header) +
- * 88 (fixed part of create response) + 520 (path) + 150 (contexts) +
+ * 88 (fixed part of create response) + 520 (path) + 206 (contexts) +
* 2 bytes of padding.
*/
-#define MAX_SMB2_CREATE_RESPONSE_SIZE 824
+#define MAX_SMB2_CREATE_RESPONSE_SIZE 880
struct smb2_create_rsp {
struct smb2_sync_hdr sync_hdr;
@@ -806,6 +817,13 @@ struct durable_reconnect_context_v2 {
__le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
} __packed;
+/* See MS-SMB2 2.2.14.2.9 */
+struct on_disk_id {
+ __le64 DiskFileId;
+ __le64 VolumeId;
+ __u32 Reserved[4];
+} __packed;
+
/* See MS-SMB2 2.2.14.2.12 */
struct durable_reconnect_context_v2_rsp {
__le32 Timeout;
@@ -826,6 +844,12 @@ struct crt_twarp_ctxt {
} __packed;
+/* See MS-SMB2 2.2.13.2.9 */
+struct crt_query_id_ctxt {
+ struct create_context ccontext;
+ __u8 Name[8];
+} __packed;
+
#define COPY_CHUNK_RES_KEY_SIZE 24
struct resume_key_req {
char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index d1181572758b..1ccbcf9c2c3b 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -734,7 +734,10 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server)
struct crypto_aead *tfm;
if (!server->secmech.ccmaesencrypt) {
- tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
+ if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM)
+ tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+ else
+ tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
if (IS_ERR(tfm)) {
cifs_dbg(VFS, "%s: Failed to alloc encrypt aead\n",
__func__);
@@ -744,7 +747,10 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server)
}
if (!server->secmech.ccmaesdecrypt) {
- tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
+ if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM)
+ tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+ else
+ tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
if (IS_ERR(tfm)) {
crypto_free_aead(server->secmech.ccmaesencrypt);
server->secmech.ccmaesencrypt = NULL;
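
Both hunks in this file pick the AEAD transform from the negotiated cipher before calling crypto_alloc_aead(). The mapping on its own, using the MS-SMB2 cipher IDs (AES-128-CCM is 0x0001, AES-128-GCM is 0x0002); smb3_aead_alg() is an illustrative name, not a kernel helper:

/* Map the negotiated SMB3 cipher to a crypto API algorithm name. */
static const char *smb3_aead_alg(unsigned short cipher_type)
{
        return cipher_type == 0x0002 /* AES-128-GCM */ ? "gcm(aes)"
                                                       : "ccm(aes)";
}
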
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 60661b3f983a..5d6d44bfe10a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -979,6 +979,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
};
unsigned int instance;
char *buf;
+ struct TCP_Server_Info *server;
optype = flags & CIFS_OP_MASK;
@@ -990,7 +991,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- if (ses->server->tcpStatus == CifsExiting)
+ server = ses->server;
+ if (server->tcpStatus == CifsExiting)
return -ENOENT;
/*
@@ -1001,7 +1003,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
* other requests.
* This can be handled by the eventual session reconnect.
*/
- rc = wait_for_compound_request(ses->server, num_rqst, flags,
+ rc = wait_for_compound_request(server, num_rqst, flags,
&instance);
if (rc)
return rc;
@@ -1017,7 +1019,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
* of smb data.
*/
- mutex_lock(&ses->server->srv_mutex);
+ mutex_lock(&server->srv_mutex);
/*
* All the parts of the compound chain obtained credits from the
@@ -1026,24 +1028,24 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
* we obtained credits and return -EAGAIN in such cases to let callers
* handle it.
*/
- if (instance != ses->server->reconnect_instance) {
- mutex_unlock(&ses->server->srv_mutex);
+ if (instance != server->reconnect_instance) {
+ mutex_unlock(&server->srv_mutex);
for (j = 0; j < num_rqst; j++)
- add_credits(ses->server, &credits[j], optype);
+ add_credits(server, &credits[j], optype);
return -EAGAIN;
}
for (i = 0; i < num_rqst; i++) {
- midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]);
+ midQ[i] = server->ops->setup_request(ses, &rqst[i]);
if (IS_ERR(midQ[i])) {
- revert_current_mid(ses->server, i);
+ revert_current_mid(server, i);
for (j = 0; j < i; j++)
cifs_delete_mid(midQ[j]);
- mutex_unlock(&ses->server->srv_mutex);
+ mutex_unlock(&server->srv_mutex);
/* Update # of requests on wire to server */
for (j = 0; j < num_rqst; j++)
- add_credits(ses->server, &credits[j], optype);
+ add_credits(server, &credits[j], optype);
return PTR_ERR(midQ[i]);
}
@@ -1059,19 +1061,19 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
else
midQ[i]->callback = cifs_compound_last_callback;
}
- cifs_in_send_inc(ses->server);
- rc = smb_send_rqst(ses->server, num_rqst, rqst, flags);
- cifs_in_send_dec(ses->server);
+ cifs_in_send_inc(server);
+ rc = smb_send_rqst(server, num_rqst, rqst, flags);
+ cifs_in_send_dec(server);
for (i = 0; i < num_rqst; i++)
cifs_save_when_sent(midQ[i]);
if (rc < 0) {
- revert_current_mid(ses->server, num_rqst);
- ses->server->sequence_number -= 2;
+ revert_current_mid(server, num_rqst);
+ server->sequence_number -= 2;
}
- mutex_unlock(&ses->server->srv_mutex);
+ mutex_unlock(&server->srv_mutex);
/*
* If sending failed for some reason or it is an oplock break that we
@@ -1079,7 +1081,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
*/
if (rc < 0 || (flags & CIFS_NO_SRV_RSP)) {
for (i = 0; i < num_rqst; i++)
- add_credits(ses->server, &credits[i], optype);
+ add_credits(server, &credits[i], optype);
goto out;
}
@@ -1099,7 +1101,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
rqst[0].rq_nvec);
for (i = 0; i < num_rqst; i++) {
- rc = wait_for_response(ses->server, midQ[i]);
+ rc = wait_for_response(server, midQ[i]);
if (rc != 0)
break;
}
@@ -1107,7 +1109,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
for (; i < num_rqst; i++) {
cifs_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n",
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
- send_cancel(ses->server, &rqst[i], midQ[i]);
+ send_cancel(server, &rqst[i], midQ[i]);
spin_lock(&GlobalMid_Lock);
if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
@@ -1123,7 +1125,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
if (rc < 0)
goto out;
- rc = cifs_sync_mid_result(midQ[i], ses->server);
+ rc = cifs_sync_mid_result(midQ[i], server);
if (rc != 0) {
/* mark this mid as cancelled to not free it below */
cancelled_mid[i] = true;
@@ -1140,14 +1142,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
buf = (char *)midQ[i]->resp_buf;
resp_iov[i].iov_base = buf;
resp_iov[i].iov_len = midQ[i]->resp_buf_size +
- ses->server->vals->header_preamble_size;
+ server->vals->header_preamble_size;
if (midQ[i]->large_buf)
resp_buf_type[i] = CIFS_LARGE_BUFFER;
else
resp_buf_type[i] = CIFS_SMALL_BUFFER;
- rc = ses->server->ops->check_receive(midQ[i], ses->server,
+ rc = server->ops->check_receive(midQ[i], server,
flags & CIFS_LOG_ERROR);
/* mark it so buf will not be freed by cifs_delete_mid */
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 50ddb795aaeb..9076150758d8 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -96,7 +96,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
break;
case XATTR_CIFS_ACL: {
-#ifdef CONFIG_CIFS_ACL
struct cifs_ntsd *pacl;
if (!value)
@@ -117,7 +116,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
CIFS_I(inode)->time = 0;
kfree(pacl);
}
-#endif /* CONFIG_CIFS_ACL */
break;
}
@@ -247,7 +245,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
break;
case XATTR_CIFS_ACL: {
-#ifdef CONFIG_CIFS_ACL
u32 acllen;
struct cifs_ntsd *pacl;
@@ -270,7 +267,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
rc = acllen;
kfree(pacl);
}
-#endif /* CONFIG_CIFS_ACL */
break;
}
diff --git a/fs/dax.c b/fs/dax.c
index fe5e33810cd4..e99e5f373c88 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -124,6 +124,15 @@ static int dax_is_empty_entry(void *entry)
}
/*
+ * true if the entry that was found is of a smaller order than the entry
+ * we were looking for
+ */
+static bool dax_is_conflict(void *entry)
+{
+ return entry == XA_RETRY_ENTRY;
+}
+
+/*
* DAX page cache entry locking
*/
struct exceptional_entry_key {
@@ -195,11 +204,13 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
* Look up entry in page cache, wait for it to become unlocked if it
* is a DAX entry and return it. The caller must subsequently call
* put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
- * if it did.
+ * if it did. The entry returned may have a larger order than @order.
+ * If @order is larger than the order of the entry found in i_pages, this
+ * function returns a dax_is_conflict entry.
*
* Must be called with the i_pages lock held.
*/
-static void *get_unlocked_entry(struct xa_state *xas)
+static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{
void *entry;
struct wait_exceptional_entry_queue ewait;
@@ -210,6 +221,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
for (;;) {
entry = xas_find_conflict(xas);
+ if (dax_entry_order(entry) < order)
+ return XA_RETRY_ENTRY;
if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
!dax_is_locked(entry))
return entry;
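
A conflict is reported through the XA_RETRY_ENTRY sentinel rather than by waiting, so a PMD-sized lookup that collides with an existing PTE-sized entry can bail out instead of sleeping forever. An illustrative caller shape; the names and the sentinel value here are placeholders, not the kernel's:

#include <stdio.h>

#define XA_RETRY_ENTRY ((void *)0x402)   /* placeholder sentinel value */

static int dax_is_conflict(void *entry)
{
        return entry == XA_RETRY_ENTRY;
}

/* Pretend the cache only holds an order-0 (PTE-sized) entry. */
static void *lookup(unsigned int order)
{
        return order > 0 ? XA_RETRY_ENTRY : NULL;
}

int main(void)
{
        void *entry = lookup(9);         /* PMD-sized request */

        if (dax_is_conflict(entry))
                printf("conflict: fall back to the PTE fault path\n");
        return 0;
}
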
@@ -254,7 +267,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
static void put_unlocked_entry(struct xa_state *xas, void *entry)
{
/* If we were the only waiter woken, wake the next one */
- if (entry)
+ if (entry && !dax_is_conflict(entry))
dax_wake_entry(xas, entry, false);